From a83d5c6e0bbfe475ebc6dc999d788cfa0b3916fd Mon Sep 17 00:00:00 2001 From: mayor Date: Sat, 3 Jan 2026 09:01:34 -0800 Subject: [PATCH] fix: Add tmux health check fallback to prevent killing healthy sessions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the daemon checks if Deacon/Witness is running, it first checks the agent bead state. If this check fails (bead not found, JSON parse error, or stale state), it would previously attempt to restart the session - even if the tmux session was perfectly healthy. This caused "session already exists" errors when: 1. Agent bead state couldn't be read (prefix mismatch, missing bead) 2. But the tmux session was actually running with Claude active Fix: Add a tmux session health check as fallback before attempting restart. If the session exists AND Claude is running in it, skip the restart and log that we're preserving the healthy session despite stale bead state. This maintains ZFC compliance (still trusts agent bead as primary source) while adding a defensive check to prevent unnecessary session kills. Fixes #63 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- internal/daemon/daemon.go | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go index 439a831b..72819d35 100644 --- a/internal/daemon/daemon.go +++ b/internal/daemon/daemon.go @@ -261,7 +261,7 @@ func (d *Daemon) runDegradedBootTriage(b *boot.Boot) { } // ensureDeaconRunning ensures the Deacon is running. -// ZFC-compliant: trusts agent bead state, no tmux inference. +// ZFC-compliant: trusts agent bead state, with tmux health check fallback. // The Deacon is the system's heartbeat - it must always be running. func (d *Daemon) ensureDeaconRunning() { // Check agent bead state (ZFC: trust what agent reports) @@ -273,7 +273,19 @@ func (d *Daemon) ensureDeaconRunning() { return } } - // Agent not running (or bead not found) - start it + + // Agent bead check failed or state is not running. + // FALLBACK: Check if tmux session is actually healthy before attempting restart. + // This prevents killing healthy sessions when bead state is stale or unreadable. + hasSession, sessionErr := d.tmux.HasSession(DeaconSessionName) + if sessionErr == nil && hasSession { + if d.tmux.IsClaudeRunning(DeaconSessionName) { + d.logger.Println("Deacon session healthy (Claude running), skipping restart despite stale bead") + return + } + } + + // Agent not running (or bead not found) AND session is not healthy - start it d.logger.Println("Deacon not running per agent bead, starting...") // Create session in deacon directory (ensures correct CLAUDE.md is loaded) @@ -371,7 +383,21 @@ func (d *Daemon) ensureWitnessRunning(rigName string) { } } - // Agent not running (or bead not found) - start it + // Agent bead check failed or state is not running. + // FALLBACK: Check if tmux session is actually healthy before attempting restart. + // This prevents killing healthy sessions when bead state is stale or unreadable. + hasSession, sessionErr := d.tmux.HasSession(sessionName) + if sessionErr == nil && hasSession { + // Session exists - check if Claude is actually running in it + if d.tmux.IsClaudeRunning(sessionName) { + // Session is healthy - don't restart it + // The bead state may be stale; agent will update it on next activity + d.logger.Printf("Witness for %s session healthy (Claude running), skipping restart despite stale bead", rigName) + return + } + } + + // Agent not running (or bead not found) AND session is not healthy - start it d.logger.Printf("Witness for %s not running per agent bead, starting...", rigName) // Create session in witness directory