Merge pull request #64 from dannomayernotabot/fix/daemon-witness-race-condition

fix: Add tmux health check fallback to prevent killing healthy sessions
This commit is contained in:
Steve Yegge
2026-01-03 11:48:44 -08:00
committed by GitHub

View File

@@ -261,7 +261,7 @@ func (d *Daemon) runDegradedBootTriage(b *boot.Boot) {
} }
// ensureDeaconRunning ensures the Deacon is running. // ensureDeaconRunning ensures the Deacon is running.
// ZFC-compliant: trusts agent bead state, no tmux inference. // ZFC-compliant: trusts agent bead state, with tmux health check fallback.
// The Deacon is the system's heartbeat - it must always be running. // The Deacon is the system's heartbeat - it must always be running.
func (d *Daemon) ensureDeaconRunning() { func (d *Daemon) ensureDeaconRunning() {
// Check agent bead state (ZFC: trust what agent reports) // Check agent bead state (ZFC: trust what agent reports)
@@ -273,7 +273,19 @@ func (d *Daemon) ensureDeaconRunning() {
return return
} }
} }
// Agent not running (or bead not found) - start it
// Agent bead check failed or state is not running.
// FALLBACK: Check if tmux session is actually healthy before attempting restart.
// This prevents killing healthy sessions when bead state is stale or unreadable.
hasSession, sessionErr := d.tmux.HasSession(DeaconSessionName)
if sessionErr == nil && hasSession {
if d.tmux.IsClaudeRunning(DeaconSessionName) {
d.logger.Println("Deacon session healthy (Claude running), skipping restart despite stale bead")
return
}
}
// Agent not running (or bead not found) AND session is not healthy - start it
d.logger.Println("Deacon not running per agent bead, starting...") d.logger.Println("Deacon not running per agent bead, starting...")
// Create session in deacon directory (ensures correct CLAUDE.md is loaded) // Create session in deacon directory (ensures correct CLAUDE.md is loaded)
@@ -371,7 +383,21 @@ func (d *Daemon) ensureWitnessRunning(rigName string) {
} }
} }
// Agent not running (or bead not found) - start it // Agent bead check failed or state is not running.
// FALLBACK: Check if tmux session is actually healthy before attempting restart.
// This prevents killing healthy sessions when bead state is stale or unreadable.
hasSession, sessionErr := d.tmux.HasSession(sessionName)
if sessionErr == nil && hasSession {
// Session exists - check if Claude is actually running in it
if d.tmux.IsClaudeRunning(sessionName) {
// Session is healthy - don't restart it
// The bead state may be stale; agent will update it on next activity
d.logger.Printf("Witness for %s session healthy (Claude running), skipping restart despite stale bead", rigName)
return
}
}
// Agent not running (or bead not found) AND session is not healthy - start it
d.logger.Printf("Witness for %s not running per agent bead, starting...", rigName) d.logger.Printf("Witness for %s not running per agent bead, starting...", rigName)
// Create session in witness directory // Create session in witness directory