fix: Add grace period to prevent Deacon restart loop (#590)
* fix(daemon): prevent runaway refinery session spawning

  Fixes #566

  The daemon spawned 812 refinery sessions over 4 days because:
  1. Zombie detection was too strict - used IsAgentRunning(session, "node") but Claude reports pane command as version number (e.g., "2.1.7"), causing healthy sessions to be killed and recreated every heartbeat.
  2. daemon.json patrol config was completely ignored - the daemon never loaded or checked the enabled flags.

  Changes:
  - refinery/manager.go: Use IsClaudeRunning() instead of IsAgentRunning() for robust Claude detection (handles "node", "claude", version patterns)
  - daemon/types.go: Add PatrolConfig types and LoadPatrolConfig() to read mayor/daemon.json
  - daemon/daemon.go: Load patrol config at startup, check enabled flags before calling ensureRefineriesRunning/ensureWitnessesRunning, add diagnostic logging for "already running" cases

  Tested: Verified over multiple heartbeats that refinery shows "already running, skipping spawn" instead of spawning new sessions.

* fix: Add grace period to prevent Deacon restart loop

  The daemon had a race condition where:
  1. ensureDeaconRunning() starts a new Deacon session
  2. checkDeaconHeartbeat() runs in the same heartbeat cycle
  3. Heartbeat file is stale (from before crash)
  4. Session is immediately killed
  5. Infinite restart loop every 3 minutes

  Fix:
  - Track when Deacon was last started (deaconLastStarted field)
  - Skip heartbeat check during 5-minute grace period
  - Add config support for Deacon (consistency with refinery/witness)

  After grace period, normal heartbeat checking resumes. Genuinely stuck sessions (no heartbeat update after 5+ min) are still detected.

  Fixes #589

---------

Co-authored-by: mayor <your-github-email@example.com>
This commit is contained in:
@@ -115,9 +115,8 @@ func (m *Manager) Start(foreground bool, agentOverride string) error {
|
||||
|
||||
if foreground {
|
||||
// In foreground mode, check tmux session (no PID inference per ZFC)
|
||||
townRoot := filepath.Dir(m.rig.Path)
|
||||
agentCfg := config.ResolveRoleAgentConfig(constants.RoleRefinery, townRoot, m.rig.Path)
|
||||
if running, _ := t.HasSession(sessionID); running && t.IsAgentRunning(sessionID, config.ExpectedPaneCommands(agentCfg)...) {
|
||||
// Use IsClaudeRunning for robust detection (see gastown#566)
|
||||
if running, _ := t.HasSession(sessionID); running && t.IsClaudeRunning(sessionID) {
|
||||
return ErrAlreadyRunning
|
||||
}
|
||||
|
||||
@@ -138,14 +137,15 @@ func (m *Manager) Start(foreground bool, agentOverride string) error {
|
||||
// Background mode: check if session already exists
|
||||
running, _ := t.HasSession(sessionID)
|
||||
if running {
|
||||
// Session exists - check if agent is actually running (healthy vs zombie)
|
||||
townRoot := filepath.Dir(m.rig.Path)
|
||||
agentCfg := config.ResolveRoleAgentConfig(constants.RoleRefinery, townRoot, m.rig.Path)
|
||||
if t.IsAgentRunning(sessionID, config.ExpectedPaneCommands(agentCfg)...) {
|
||||
// Healthy - agent is running
|
||||
// Session exists - check if Claude is actually running (healthy vs zombie)
|
||||
// Use IsClaudeRunning for robust detection: Claude can report as "node", "claude",
|
||||
// or version number like "2.0.76". IsAgentRunning with just "node" was too strict
|
||||
// and caused healthy sessions to be killed. See: gastown#566
|
||||
if t.IsClaudeRunning(sessionID) {
|
||||
// Healthy - Claude is running
|
||||
return ErrAlreadyRunning
|
||||
}
|
||||
// Zombie - tmux alive but agent dead. Kill and recreate.
|
||||
// Zombie - tmux alive but Claude dead. Kill and recreate.
|
||||
_, _ = fmt.Fprintln(m.output, "⚠ Detected zombie session (tmux alive, agent dead). Recreating...")
|
||||
if err := t.KillSession(sessionID); err != nil {
|
||||
return fmt.Errorf("killing zombie session: %w", err)
|
||||
|
||||
Reference in New Issue
Block a user