Merge pull request #64 from dannomayernotabot/fix/daemon-witness-race-condition
fix: Add tmux health check fallback to prevent killing healthy sessions
This commit is contained in:
@@ -261,7 +261,7 @@ func (d *Daemon) runDegradedBootTriage(b *boot.Boot) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ensureDeaconRunning ensures the Deacon is running.
|
// ensureDeaconRunning ensures the Deacon is running.
|
||||||
// ZFC-compliant: trusts agent bead state, no tmux inference.
|
// ZFC-compliant: trusts agent bead state, with tmux health check fallback.
|
||||||
// The Deacon is the system's heartbeat - it must always be running.
|
// The Deacon is the system's heartbeat - it must always be running.
|
||||||
func (d *Daemon) ensureDeaconRunning() {
|
func (d *Daemon) ensureDeaconRunning() {
|
||||||
// Check agent bead state (ZFC: trust what agent reports)
|
// Check agent bead state (ZFC: trust what agent reports)
|
||||||
@@ -273,7 +273,19 @@ func (d *Daemon) ensureDeaconRunning() {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Agent not running (or bead not found) - start it
|
|
||||||
|
// Agent bead check failed or state is not running.
|
||||||
|
// FALLBACK: Check if tmux session is actually healthy before attempting restart.
|
||||||
|
// This prevents killing healthy sessions when bead state is stale or unreadable.
|
||||||
|
hasSession, sessionErr := d.tmux.HasSession(DeaconSessionName)
|
||||||
|
if sessionErr == nil && hasSession {
|
||||||
|
if d.tmux.IsClaudeRunning(DeaconSessionName) {
|
||||||
|
d.logger.Println("Deacon session healthy (Claude running), skipping restart despite stale bead")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Agent not running (or bead not found) AND session is not healthy - start it
|
||||||
d.logger.Println("Deacon not running per agent bead, starting...")
|
d.logger.Println("Deacon not running per agent bead, starting...")
|
||||||
|
|
||||||
// Create session in deacon directory (ensures correct CLAUDE.md is loaded)
|
// Create session in deacon directory (ensures correct CLAUDE.md is loaded)
|
||||||
@@ -371,7 +383,21 @@ func (d *Daemon) ensureWitnessRunning(rigName string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Agent not running (or bead not found) - start it
|
// Agent bead check failed or state is not running.
|
||||||
|
// FALLBACK: Check if tmux session is actually healthy before attempting restart.
|
||||||
|
// This prevents killing healthy sessions when bead state is stale or unreadable.
|
||||||
|
hasSession, sessionErr := d.tmux.HasSession(sessionName)
|
||||||
|
if sessionErr == nil && hasSession {
|
||||||
|
// Session exists - check if Claude is actually running in it
|
||||||
|
if d.tmux.IsClaudeRunning(sessionName) {
|
||||||
|
// Session is healthy - don't restart it
|
||||||
|
// The bead state may be stale; agent will update it on next activity
|
||||||
|
d.logger.Printf("Witness for %s session healthy (Claude running), skipping restart despite stale bead", rigName)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Agent not running (or bead not found) AND session is not healthy - start it
|
||||||
d.logger.Printf("Witness for %s not running per agent bead, starting...", rigName)
|
d.logger.Printf("Witness for %s not running per agent bead, starting...", rigName)
|
||||||
|
|
||||||
// Create session in witness directory
|
// Create session in witness directory
|
||||||
|
|||||||
Reference in New Issue
Block a user