feat(daemon): Add circuit breaker for stuck agents (gt-72cqu)
- Add Refinery monitoring to daemon heartbeat (ensureRefineriesRunning) - Add circuit breaker: agents marked dead by checkStaleAgents() are now force-killed and restarted, even if Claude process is alive - Fixes zombie Claude sessions that werent updating their bead state The circuit breaker flow: 1. Agent gets stuck (stops updating bead state) 2. After 15 minutes: checkStaleAgents() marks bead as dead 3. On next heartbeat: ensure*Running() sees state=dead 4. Force-kill session and restart fresh Generated with Claude Code Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
9992
.beads/issues.jsonl
9992
.beads/issues.jsonl
File diff suppressed because one or more lines are too long
@@ -1 +1 @@
|
|||||||
gt-9bm3j
|
gt-gastown-polecat-warboy
|
||||||
|
|||||||
@@ -158,6 +158,9 @@ func (d *Daemon) heartbeat(state *State) {
|
|||||||
// 2. Ensure Witnesses are running for all rigs (restart if dead)
|
// 2. Ensure Witnesses are running for all rigs (restart if dead)
|
||||||
d.ensureWitnessesRunning()
|
d.ensureWitnessesRunning()
|
||||||
|
|
||||||
|
// 2b. Ensure Refineries are running for all rigs (restart if dead)
|
||||||
|
d.ensureRefineriesRunning()
|
||||||
|
|
||||||
// 3. Trigger pending polecat spawns (bootstrap mode - ZFC violation acceptable)
|
// 3. Trigger pending polecat spawns (bootstrap mode - ZFC violation acceptable)
|
||||||
// This ensures polecats get nudged even when Deacon isn't in a patrol cycle.
|
// This ensures polecats get nudged even when Deacon isn't in a patrol cycle.
|
||||||
// Uses regex-based WaitForClaudeReady, which is acceptable for daemon bootstrap.
|
// Uses regex-based WaitForClaudeReady, which is acceptable for daemon bootstrap.
|
||||||
@@ -282,17 +285,35 @@ func (d *Daemon) ensureDeaconRunning() {
|
|||||||
// Timeout fallback for stale state is in lifecycle.go
|
// Timeout fallback for stale state is in lifecycle.go
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CIRCUIT BREAKER: If agent is marked "dead" by checkStaleAgents(),
|
||||||
|
// force-kill the session and restart. This handles stuck agents that
|
||||||
|
// are still alive (zombie Claude sessions that haven't updated their bead).
|
||||||
|
if beadState == "dead" {
|
||||||
|
d.logger.Println("Deacon is marked dead (circuit breaker triggered), forcing restart...")
|
||||||
|
deaconSession := d.getDeaconSessionName()
|
||||||
|
hasSession, _ := d.tmux.HasSession(deaconSession)
|
||||||
|
if hasSession {
|
||||||
|
if err := d.tmux.KillSession(deaconSession); err != nil {
|
||||||
|
d.logger.Printf("Warning: failed to kill dead Deacon session: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Fall through to restart
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Agent bead check failed or state is not running.
|
// Agent bead check failed or state is not running/working.
|
||||||
// FALLBACK: Check if tmux session is actually healthy before attempting restart.
|
// FALLBACK: Check if tmux session is actually healthy before attempting restart.
|
||||||
// This prevents killing healthy sessions when bead state is stale or unreadable.
|
// This prevents killing healthy sessions when bead state is stale or unreadable.
|
||||||
deaconSession := d.getDeaconSessionName()
|
// Skip this check if agent was marked dead (we already handled that above).
|
||||||
hasSession, sessionErr := d.tmux.HasSession(deaconSession)
|
if beadState != "dead" {
|
||||||
if sessionErr == nil && hasSession {
|
deaconSession := d.getDeaconSessionName()
|
||||||
if d.tmux.IsClaudeRunning(deaconSession) {
|
hasSession, sessionErr := d.tmux.HasSession(deaconSession)
|
||||||
d.logger.Println("Deacon session healthy (Claude running), skipping restart despite stale bead")
|
if sessionErr == nil && hasSession {
|
||||||
return
|
if d.tmux.IsClaudeRunning(deaconSession) {
|
||||||
|
d.logger.Println("Deacon session healthy (Claude running), skipping restart despite stale bead")
|
||||||
|
return
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -395,19 +416,36 @@ func (d *Daemon) ensureWitnessRunning(rigName string) {
|
|||||||
// Agent reports it's running - trust it
|
// Agent reports it's running - trust it
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CIRCUIT BREAKER: If agent is marked "dead" by checkStaleAgents(),
|
||||||
|
// force-kill the session and restart. This handles stuck agents that
|
||||||
|
// are still alive (zombie Claude sessions that haven't updated their bead).
|
||||||
|
if beadState == "dead" {
|
||||||
|
d.logger.Printf("Witness for %s is marked dead (circuit breaker triggered), forcing restart...", rigName)
|
||||||
|
hasSession, _ := d.tmux.HasSession(sessionName)
|
||||||
|
if hasSession {
|
||||||
|
if err := d.tmux.KillSession(sessionName); err != nil {
|
||||||
|
d.logger.Printf("Warning: failed to kill dead witness session for %s: %v", rigName, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Fall through to restart
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Agent bead check failed or state is not running.
|
// Agent bead check failed or state is not running/working.
|
||||||
// FALLBACK: Check if tmux session is actually healthy before attempting restart.
|
// FALLBACK: Check if tmux session is actually healthy before attempting restart.
|
||||||
// This prevents killing healthy sessions when bead state is stale or unreadable.
|
// This prevents killing healthy sessions when bead state is stale or unreadable.
|
||||||
hasSession, sessionErr := d.tmux.HasSession(sessionName)
|
// Skip this check if agent was marked dead (we already handled that above).
|
||||||
if sessionErr == nil && hasSession {
|
if beadState != "dead" {
|
||||||
// Session exists - check if Claude is actually running in it
|
hasSession, sessionErr := d.tmux.HasSession(sessionName)
|
||||||
if d.tmux.IsClaudeRunning(sessionName) {
|
if sessionErr == nil && hasSession {
|
||||||
// Session is healthy - don't restart it
|
// Session exists - check if Claude is actually running in it
|
||||||
// The bead state may be stale; agent will update it on next activity
|
if d.tmux.IsClaudeRunning(sessionName) {
|
||||||
d.logger.Printf("Witness for %s session healthy (Claude running), skipping restart despite stale bead", rigName)
|
// Session is healthy - don't restart it
|
||||||
return
|
// The bead state may be stale; agent will update it on next activity
|
||||||
|
d.logger.Printf("Witness for %s session healthy (Claude running), skipping restart despite stale bead", rigName)
|
||||||
|
return
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -443,6 +481,114 @@ func (d *Daemon) ensureWitnessRunning(rigName string) {
|
|||||||
d.logger.Printf("Witness session for %s started successfully", rigName)
|
d.logger.Printf("Witness session for %s started successfully", rigName)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ensureRefineriesRunning ensures refineries are running for all rigs.
|
||||||
|
// Called on each heartbeat to maintain refinery merge queue processing.
|
||||||
|
func (d *Daemon) ensureRefineriesRunning() {
|
||||||
|
rigs := d.getKnownRigs()
|
||||||
|
for _, rigName := range rigs {
|
||||||
|
d.ensureRefineryRunning(rigName)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ensureRefineryRunning ensures the refinery for a specific rig is running.
|
||||||
|
func (d *Daemon) ensureRefineryRunning(rigName string) {
|
||||||
|
agentID := beads.RefineryBeadID(rigName)
|
||||||
|
sessionName := "gt-" + rigName + "-refinery"
|
||||||
|
|
||||||
|
// Check agent bead state (ZFC: trust what agent reports)
|
||||||
|
beadState, beadErr := d.getAgentBeadState(agentID)
|
||||||
|
if beadErr == nil {
|
||||||
|
if beadState == "running" || beadState == "working" {
|
||||||
|
// Agent reports it's running - trust it
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// CIRCUIT BREAKER: If agent is marked "dead" by checkStaleAgents(),
|
||||||
|
// force-kill the session and restart. This handles stuck agents that
|
||||||
|
// are still alive (zombie Claude sessions that haven't updated their bead).
|
||||||
|
if beadState == "dead" {
|
||||||
|
d.logger.Printf("Refinery for %s is marked dead (circuit breaker triggered), forcing restart...", rigName)
|
||||||
|
hasSession, _ := d.tmux.HasSession(sessionName)
|
||||||
|
if hasSession {
|
||||||
|
if err := d.tmux.KillSession(sessionName); err != nil {
|
||||||
|
d.logger.Printf("Warning: failed to kill dead refinery session for %s: %v", rigName, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Fall through to restart
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Agent bead check failed or state is not running/working.
|
||||||
|
// FALLBACK: Check if tmux session is actually healthy before attempting restart.
|
||||||
|
// This prevents killing healthy sessions when bead state is stale or unreadable.
|
||||||
|
// Skip this check if agent was marked dead (we already handled that above).
|
||||||
|
if beadState != "dead" {
|
||||||
|
hasSession, sessionErr := d.tmux.HasSession(sessionName)
|
||||||
|
if sessionErr == nil && hasSession {
|
||||||
|
// Session exists - check if Claude is actually running in it
|
||||||
|
if d.tmux.IsClaudeRunning(sessionName) {
|
||||||
|
// Session is healthy - don't restart it
|
||||||
|
// The bead state may be stale; agent will update it on next activity
|
||||||
|
d.logger.Printf("Refinery for %s session healthy (Claude running), skipping restart despite stale bead", rigName)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Agent not running (or bead not found) AND session is not healthy - start it
|
||||||
|
d.logger.Printf("Refinery for %s not running per agent bead, starting...", rigName)
|
||||||
|
|
||||||
|
// Determine working directory
|
||||||
|
rigPath := filepath.Join(d.config.TownRoot, rigName)
|
||||||
|
refineryDir := filepath.Join(rigPath, "refinery", "rig")
|
||||||
|
if _, err := os.Stat(refineryDir); os.IsNotExist(err) {
|
||||||
|
// Fall back to rig path if refinery/rig doesn't exist
|
||||||
|
refineryDir = rigPath
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create session in refinery directory
|
||||||
|
// Use EnsureSessionFresh to handle zombie sessions that exist but have dead Claude
|
||||||
|
if err := d.tmux.EnsureSessionFresh(sessionName, refineryDir); err != nil {
|
||||||
|
d.logger.Printf("Error creating refinery session for %s: %v", rigName, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set environment
|
||||||
|
bdActor := fmt.Sprintf("%s/refinery", rigName)
|
||||||
|
_ = d.tmux.SetEnvironment(sessionName, "GT_ROLE", "refinery")
|
||||||
|
_ = d.tmux.SetEnvironment(sessionName, "GT_RIG", rigName)
|
||||||
|
_ = d.tmux.SetEnvironment(sessionName, "BD_ACTOR", bdActor)
|
||||||
|
|
||||||
|
// Set beads environment
|
||||||
|
beadsDir := filepath.Join(rigPath, "mayor", "rig", ".beads")
|
||||||
|
_ = d.tmux.SetEnvironment(sessionName, "BEADS_DIR", beadsDir)
|
||||||
|
_ = d.tmux.SetEnvironment(sessionName, "BEADS_NO_DAEMON", "1")
|
||||||
|
_ = d.tmux.SetEnvironment(sessionName, "BEADS_AGENT_NAME", bdActor)
|
||||||
|
|
||||||
|
// Apply theming (non-fatal)
|
||||||
|
theme := tmux.AssignTheme(rigName)
|
||||||
|
_ = d.tmux.ConfigureGasTownSession(sessionName, theme, rigName, "refinery", "refinery")
|
||||||
|
|
||||||
|
// Launch Claude with environment exported inline
|
||||||
|
envVars := map[string]string{
|
||||||
|
"GT_ROLE": "refinery",
|
||||||
|
"GT_RIG": rigName,
|
||||||
|
"BD_ACTOR": bdActor,
|
||||||
|
"GIT_AUTHOR_NAME": bdActor,
|
||||||
|
}
|
||||||
|
if err := d.tmux.SendKeys(sessionName, config.BuildStartupCommand(envVars, "", "")); err != nil {
|
||||||
|
d.logger.Printf("Error launching Claude in refinery session for %s: %v", rigName, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for Claude to start, then accept bypass permissions warning if it appears.
|
||||||
|
if err := d.tmux.WaitForCommand(sessionName, constants.SupportedShells, constants.ClaudeStartTimeout); err != nil {
|
||||||
|
// Non-fatal - Claude might still start
|
||||||
|
}
|
||||||
|
_ = d.tmux.AcceptBypassPermissionsWarning(sessionName)
|
||||||
|
|
||||||
|
d.logger.Printf("Refinery session for %s started successfully", rigName)
|
||||||
|
}
|
||||||
|
|
||||||
// getKnownRigs returns list of registered rig names.
|
// getKnownRigs returns list of registered rig names.
|
||||||
func (d *Daemon) getKnownRigs() []string {
|
func (d *Daemon) getKnownRigs() []string {
|
||||||
|
|||||||
Reference in New Issue
Block a user