fix: remove observable states from agent_state (discover, don't track)
The agent_state field was recording observable states such as "running", "dead", and "idle", which violated the "Discover, Don't Track" principle. This caused stale-state bugs where agents were marked "dead" in beads but were actually running in tmux.

Changes:
- Remove daemon's checkStaleAgents(), which marked agents "dead"
- Simplify ensureXxxRunning() to use tmux.IsClaudeRunning() directly
- Remove reportAgentState() calls from gt prime and gt handoff
- Add SetHookBead/ClearHookBead helpers that don't update agent_state
- Use ClearHookBead in gt done and gt unsling
- Simplify gt status to derive state from tmux, not bead

Non-observable states (stuck, awaiting-gate, muted, paused) are still set because they represent intentional agent decisions that can't be discovered from tmux state.

Fixes: gt-zecmc

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
committed by
Steve Yegge
parent
950e35317e
commit
1f44482ad0
@@ -357,6 +357,13 @@ func (b *Beads) run(args ...string) ([]byte, error) {
|
||||
return stdout.Bytes(), nil
|
||||
}
|
||||
|
||||
// Run executes a bd command and returns stdout.
|
||||
// This is a public wrapper around the internal run method for cases where
|
||||
// callers need to run arbitrary bd commands.
|
||||
func (b *Beads) Run(args ...string) ([]byte, error) {
|
||||
return b.run(args...)
|
||||
}
|
||||
|
||||
// wrapError wraps bd errors with context.
|
||||
func (b *Beads) wrapError(err error, stderr string, args []string) error {
|
||||
stderr = strings.TrimSpace(stderr)
|
||||
@@ -1144,6 +1151,38 @@ func (b *Beads) UpdateAgentState(id string, state string, hookBead *string) erro
|
||||
return nil
|
||||
}
|
||||
|
||||
// SetHookBead sets the hook_bead slot on an agent bead.
|
||||
// This is a convenience wrapper that only sets the hook without changing agent_state.
|
||||
// Per gt-zecmc: agent_state ("running", "dead", "idle") is observable from tmux
|
||||
// and should not be recorded in beads ("discover, don't track" principle).
|
||||
func (b *Beads) SetHookBead(agentBeadID, hookBeadID string) error {
|
||||
// Set the hook using bd slot set
|
||||
// This updates the hook_bead column directly in SQLite
|
||||
_, err := b.run("slot", "set", agentBeadID, "hook", hookBeadID)
|
||||
if err != nil {
|
||||
// If slot is already occupied, clear it first then retry
|
||||
errStr := err.Error()
|
||||
if strings.Contains(errStr, "already occupied") {
|
||||
_, _ = b.run("slot", "clear", agentBeadID, "hook")
|
||||
_, err = b.run("slot", "set", agentBeadID, "hook", hookBeadID)
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("setting hook: %w", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ClearHookBead clears the hook_bead slot on an agent bead.
|
||||
// Used when work is complete or unslung.
|
||||
func (b *Beads) ClearHookBead(agentBeadID string) error {
|
||||
_, err := b.run("slot", "clear", agentBeadID, "hook")
|
||||
if err != nil {
|
||||
return fmt.Errorf("clearing hook: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// UpdateAgentCleanupStatus updates the cleanup_status field in an agent bead.
|
||||
// This is called by the polecat to self-report its git state (ZFC compliance).
|
||||
// Valid statuses: clean, has_uncommitted, has_stash, has_unpushed
|
||||
|
||||
@@ -356,12 +356,10 @@ func runDone(cmd *cobra.Command, args []string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// updateAgentStateOnDone updates the agent bead state when work is complete.
|
||||
// Maps exit type to agent state:
|
||||
// - COMPLETED → "done"
|
||||
// - ESCALATED → "stuck"
|
||||
// - DEFERRED → "idle"
|
||||
// - PHASE_COMPLETE → "awaiting-gate"
|
||||
// updateAgentStateOnDone clears the agent's hook and reports cleanup status.
|
||||
// Per gt-zecmc: observable states ("done", "idle") removed - use tmux to discover.
|
||||
// Non-observable states ("stuck", "awaiting-gate") are still set since they represent
|
||||
// intentional agent decisions that can't be observed from tmux.
|
||||
//
|
||||
// Also self-reports cleanup_status for ZFC compliance (#10).
|
||||
func updateAgentStateOnDone(cwd, townRoot, exitType, _ string) { // issueID unused but kept for future audit logging
|
||||
@@ -384,22 +382,6 @@ func updateAgentStateOnDone(cwd, townRoot, exitType, _ string) { // issueID unus
|
||||
return
|
||||
}
|
||||
|
||||
// Map exit type to agent state
|
||||
var newState string
|
||||
switch exitType {
|
||||
case ExitCompleted:
|
||||
newState = "done"
|
||||
case ExitEscalated:
|
||||
newState = "stuck"
|
||||
case ExitDeferred:
|
||||
newState = "idle"
|
||||
case ExitPhaseComplete:
|
||||
newState = "awaiting-gate"
|
||||
default:
|
||||
return
|
||||
}
|
||||
|
||||
// Update agent bead with new state and clear hook_bead (work is done)
|
||||
// Use rig path for slot commands - bd slot doesn't route from town root
|
||||
var beadsPath string
|
||||
switch ctx.Role {
|
||||
@@ -423,11 +405,26 @@ func updateAgentStateOnDone(cwd, townRoot, exitType, _ string) { // issueID unus
|
||||
}
|
||||
}
|
||||
|
||||
emptyHook := ""
|
||||
if err := bd.UpdateAgentState(agentBeadID, newState, &emptyHook); err != nil {
|
||||
// Log warning instead of silent ignore - helps debug cross-beads issues
|
||||
fmt.Fprintf(os.Stderr, "Warning: couldn't update agent %s state on done: %v\n", agentBeadID, err)
|
||||
return
|
||||
// Clear the hook (work is done) - gt-zecmc
|
||||
if err := bd.ClearHookBead(agentBeadID); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Warning: couldn't clear agent %s hook: %v\n", agentBeadID, err)
|
||||
}
|
||||
|
||||
// Only set non-observable states - "stuck" and "awaiting-gate" are intentional
|
||||
// agent decisions that can't be discovered from tmux. Skip "done" and "idle"
|
||||
// since those are observable (no session = done, session + no hook = idle).
|
||||
switch exitType {
|
||||
case ExitEscalated:
|
||||
// "stuck" = agent is requesting help - not observable from tmux
|
||||
if _, err := bd.Run("agent", "state", agentBeadID, "stuck"); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Warning: couldn't set agent %s to stuck: %v\n", agentBeadID, err)
|
||||
}
|
||||
case ExitPhaseComplete:
|
||||
// "awaiting-gate" = agent is waiting for external trigger - not observable
|
||||
if _, err := bd.Run("agent", "state", agentBeadID, "awaiting-gate"); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Warning: couldn't set agent %s to awaiting-gate: %v\n", agentBeadID, err)
|
||||
}
|
||||
// ExitCompleted and ExitDeferred don't set state - observable from tmux
|
||||
}
|
||||
|
||||
// ZFC #10: Self-report cleanup status
|
||||
|
||||
@@ -182,19 +182,9 @@ func runHandoff(cmd *cobra.Command, args []string) error {
|
||||
}
|
||||
}
|
||||
|
||||
// Report agent state as stopped (ZFC: agents self-report state)
|
||||
cwd, _ := os.Getwd()
|
||||
if townRoot, _ := workspace.FindFromCwd(); townRoot != "" {
|
||||
if roleInfo, err := GetRoleWithContext(cwd, townRoot); err == nil {
|
||||
reportAgentState(RoleContext{
|
||||
Role: roleInfo.Role,
|
||||
Rig: roleInfo.Rig,
|
||||
Polecat: roleInfo.Polecat,
|
||||
TownRoot: townRoot,
|
||||
WorkDir: cwd,
|
||||
}, "stopped")
|
||||
}
|
||||
}
|
||||
// NOTE: reportAgentState("stopped") removed (gt-zecmc)
|
||||
// Agent liveness is observable from tmux - no need to record it in bead.
|
||||
// "Discover, don't track" principle: reality is truth, state is derived.
|
||||
|
||||
// Clear scrollback history before respawn (resets copy-mode from [0/N] to [0/0])
|
||||
if err := t.ClearHistory(pane); err != nil {
|
||||
|
||||
@@ -149,8 +149,9 @@ func runPrime(cmd *cobra.Command, args []string) error {
|
||||
// Ensure beads redirect exists for worktree-based roles
|
||||
ensureBeadsRedirect(ctx)
|
||||
|
||||
// Report agent state as running (ZFC: agents self-report state)
|
||||
reportAgentState(ctx, "running")
|
||||
// NOTE: reportAgentState("running") removed (gt-zecmc)
|
||||
// Agent liveness is observable from tmux - no need to record it in bead.
|
||||
// "Discover, don't track" principle: reality is truth, state is derived.
|
||||
|
||||
// Emit session_start event for seance discovery
|
||||
emitSessionEvent(ctx)
|
||||
|
||||
@@ -1033,13 +1033,13 @@ func updateAgentHookBead(agentID, beadID, workDir, townBeadsDir string) {
|
||||
}
|
||||
|
||||
// Run from workDir WITHOUT BEADS_DIR to enable redirect-based routing.
|
||||
// Update agent_state to "running" and set hook_bead to the slung work.
|
||||
// For same-database beads, the hook slot is set via `bd slot set`.
|
||||
// Set hook_bead to the slung work (gt-zecmc: removed agent_state update).
|
||||
// Agent liveness is observable from tmux - no need to record it in bead.
|
||||
// For cross-database scenarios, slot set may fail gracefully (warning only).
|
||||
bd := beads.New(bdWorkDir)
|
||||
if err := bd.UpdateAgentState(agentBeadID, "running", &beadID); err != nil {
|
||||
if err := bd.SetHookBead(agentBeadID, beadID); err != nil {
|
||||
// Log warning instead of silent ignore - helps debug cross-beads issues
|
||||
fmt.Fprintf(os.Stderr, "Warning: couldn't update agent %s state: %v\n", agentBeadID, err)
|
||||
fmt.Fprintf(os.Stderr, "Warning: couldn't set agent %s hook: %v\n", agentBeadID, err)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
@@ -586,40 +586,34 @@ func outputStatusText(status TownStatus) error {
|
||||
// renderAgentDetails renders full agent bead details
|
||||
func renderAgentDetails(agent AgentRuntime, indent string, hooks []AgentHookInfo, townRoot string) { //nolint:unparam // indent kept for future customization
|
||||
// Line 1: Agent bead ID + status
|
||||
// Reconcile bead state with tmux session state to surface mismatches
|
||||
// States: "running" (active), "idle" (waiting), "stopped", "dead", etc.
|
||||
beadState := agent.State
|
||||
// Per gt-zecmc: derive status from tmux (observable reality), not bead state.
|
||||
// "Discover, don't track" - agent liveness is observable from tmux session.
|
||||
sessionExists := agent.Running
|
||||
|
||||
// "idle" is a normal operational state (running but waiting for work)
|
||||
// Treat it the same as "running" for reconciliation purposes
|
||||
beadSaysRunning := beadState == "running" || beadState == "idle" || beadState == ""
|
||||
|
||||
var statusStr string
|
||||
var stateInfo string
|
||||
|
||||
switch {
|
||||
case beadSaysRunning && sessionExists:
|
||||
// Normal running state - session exists and bead agrees
|
||||
if sessionExists {
|
||||
statusStr = style.Success.Render("running")
|
||||
case beadSaysRunning && !sessionExists:
|
||||
// Bead thinks running but session is gone - stale bead state
|
||||
statusStr = style.Error.Render("running")
|
||||
stateInfo = style.Warning.Render(" [dead]")
|
||||
case !beadSaysRunning && sessionExists:
|
||||
// Session exists but bead says stopped/dead - mismatch!
|
||||
// This is the key case: tmux says alive, bead says dead/stopped
|
||||
statusStr = style.Success.Render("running")
|
||||
stateInfo = style.Warning.Render(" [bead: " + beadState + "]")
|
||||
default:
|
||||
// Both agree: stopped
|
||||
} else {
|
||||
statusStr = style.Error.Render("stopped")
|
||||
}
|
||||
|
||||
// Add agent state info if not already shown and state is interesting
|
||||
// Skip "idle" and "running" as they're normal operational states
|
||||
if stateInfo == "" && beadState != "" && beadState != "idle" && beadState != "running" {
|
||||
// Show non-observable states that represent intentional agent decisions.
|
||||
// These can't be discovered from tmux and are legitimately recorded in beads.
|
||||
beadState := agent.State
|
||||
switch beadState {
|
||||
case "stuck":
|
||||
// Agent escalated - needs help
|
||||
stateInfo = style.Warning.Render(" [stuck]")
|
||||
case "awaiting-gate":
|
||||
// Agent waiting for external trigger (phase gate)
|
||||
stateInfo = style.Dim.Render(" [awaiting-gate]")
|
||||
case "muted", "paused", "degraded":
|
||||
// Other intentional non-observable states
|
||||
stateInfo = style.Dim.Render(fmt.Sprintf(" [%s]", beadState))
|
||||
// Ignore observable states: "running", "idle", "dead", "done", "stopped", ""
|
||||
// These should be derived from tmux, not bead.
|
||||
}
|
||||
|
||||
// Build agent bead ID using canonical naming: prefix-rig-role-name
|
||||
@@ -741,22 +735,8 @@ func formatMQSummaryCompact(mq *MQSummary) string {
|
||||
|
||||
// renderAgentCompactWithSuffix renders a single-line agent status with an extra suffix
|
||||
func renderAgentCompactWithSuffix(agent AgentRuntime, indent string, hooks []AgentHookInfo, townRoot string, suffix string) {
|
||||
// Build status indicator
|
||||
var statusIndicator string
|
||||
beadState := agent.State
|
||||
sessionExists := agent.Running
|
||||
beadSaysRunning := beadState == "running" || beadState == "idle" || beadState == ""
|
||||
|
||||
switch {
|
||||
case beadSaysRunning && sessionExists:
|
||||
statusIndicator = style.Success.Render("●")
|
||||
case beadSaysRunning && !sessionExists:
|
||||
statusIndicator = style.Error.Render("●") + style.Warning.Render(" dead")
|
||||
case !beadSaysRunning && sessionExists:
|
||||
statusIndicator = style.Success.Render("●") + style.Warning.Render(" ["+beadState+"]")
|
||||
default:
|
||||
statusIndicator = style.Error.Render("○")
|
||||
}
|
||||
// Build status indicator (gt-zecmc: use tmux state, not bead state)
|
||||
statusIndicator := buildStatusIndicator(agent)
|
||||
|
||||
// Get hook info
|
||||
hookBead := agent.HookBead
|
||||
@@ -795,22 +775,8 @@ func renderAgentCompactWithSuffix(agent AgentRuntime, indent string, hooks []Age
|
||||
|
||||
// renderAgentCompact renders a single-line agent status
|
||||
func renderAgentCompact(agent AgentRuntime, indent string, hooks []AgentHookInfo, townRoot string) {
|
||||
// Build status indicator
|
||||
var statusIndicator string
|
||||
beadState := agent.State
|
||||
sessionExists := agent.Running
|
||||
beadSaysRunning := beadState == "running" || beadState == "idle" || beadState == ""
|
||||
|
||||
switch {
|
||||
case beadSaysRunning && sessionExists:
|
||||
statusIndicator = style.Success.Render("●")
|
||||
case beadSaysRunning && !sessionExists:
|
||||
statusIndicator = style.Error.Render("●") + style.Warning.Render(" dead")
|
||||
case !beadSaysRunning && sessionExists:
|
||||
statusIndicator = style.Success.Render("●") + style.Warning.Render(" ["+beadState+"]")
|
||||
default:
|
||||
statusIndicator = style.Error.Render("○")
|
||||
}
|
||||
// Build status indicator (gt-zecmc: use tmux state, not bead state)
|
||||
statusIndicator := buildStatusIndicator(agent)
|
||||
|
||||
// Get hook info
|
||||
hookBead := agent.HookBead
|
||||
@@ -847,6 +813,35 @@ func renderAgentCompact(agent AgentRuntime, indent string, hooks []AgentHookInfo
|
||||
fmt.Printf("%s%-12s %s%s%s\n", indent, agent.Name, statusIndicator, hookSuffix, mailSuffix)
|
||||
}
|
||||
|
||||
// buildStatusIndicator creates the visual status indicator for an agent.
|
||||
// Per gt-zecmc: uses tmux state (observable reality), not bead state.
|
||||
// Non-observable states (stuck, awaiting-gate, muted, etc.) are shown as suffixes.
|
||||
func buildStatusIndicator(agent AgentRuntime) string {
|
||||
sessionExists := agent.Running
|
||||
|
||||
// Base indicator from tmux state
|
||||
var indicator string
|
||||
if sessionExists {
|
||||
indicator = style.Success.Render("●")
|
||||
} else {
|
||||
indicator = style.Error.Render("○")
|
||||
}
|
||||
|
||||
// Add non-observable state suffix if present
|
||||
beadState := agent.State
|
||||
switch beadState {
|
||||
case "stuck":
|
||||
indicator += style.Warning.Render(" stuck")
|
||||
case "awaiting-gate":
|
||||
indicator += style.Dim.Render(" gate")
|
||||
case "muted", "paused", "degraded":
|
||||
indicator += style.Dim.Render(" " + beadState)
|
||||
// Ignore observable states: running, idle, dead, done, stopped, ""
|
||||
}
|
||||
|
||||
return indicator
|
||||
}
|
||||
|
||||
// formatHookInfo formats the hook bead and title for display
|
||||
func formatHookInfo(hookBead, title string, maxLen int) string {
|
||||
if hookBead == "" {
|
||||
|
||||
@@ -162,9 +162,8 @@ func runUnsling(cmd *cobra.Command, args []string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Clear the hook by updating agent bead with empty hook_bead
|
||||
emptyHook := ""
|
||||
if err := b.UpdateAgentState(agentBeadID, "running", &emptyHook); err != nil {
|
||||
// Clear the hook (gt-zecmc: removed agent_state update - observable from tmux)
|
||||
if err := b.ClearHookBead(agentBeadID); err != nil {
|
||||
return fmt.Errorf("clearing hook from agent bead %s: %w", agentBeadID, err)
|
||||
}
|
||||
|
||||
|
||||
@@ -189,9 +189,10 @@ func (d *Daemon) heartbeat(state *State) {
|
||||
// 4. Process lifecycle requests
|
||||
d.processLifecycleRequests()
|
||||
|
||||
// 5. Check for stale agents (timeout fallback)
|
||||
// Agents that report "running" but haven't updated in too long are marked dead
|
||||
d.checkStaleAgents()
|
||||
// 5. Stale agent check REMOVED (gt-zecmc)
|
||||
// Was: d.checkStaleAgents() - marked agents "dead" based on bead update time.
|
||||
// This violated "discover, don't track" - agent liveness is observable from tmux.
|
||||
// The daemon now checks tmux directly in ensureXxxRunning() functions.
|
||||
|
||||
// 6. Check for GUPP violations (agents with work-on-hook not progressing)
|
||||
d.checkGUPPViolations()
|
||||
@@ -288,58 +289,28 @@ func (d *Daemon) runDegradedBootTriage(b *boot.Boot) {
|
||||
}
|
||||
|
||||
// ensureDeaconRunning ensures the Deacon is running.
|
||||
// ZFC-compliant: trusts agent bead state, with tmux health check fallback.
|
||||
// Discover, don't track: checks tmux directly instead of bead state (gt-zecmc).
|
||||
// The Deacon is the system's heartbeat - it must always be running.
|
||||
func (d *Daemon) ensureDeaconRunning() {
|
||||
// Check agent bead state (ZFC: trust what agent reports)
|
||||
beadState, beadErr := d.getAgentBeadState(d.getDeaconSessionName())
|
||||
if beadErr == nil {
|
||||
if beadState == "running" || beadState == "working" {
|
||||
// Agent reports it's running - trust it
|
||||
// Timeout fallback for stale state is in lifecycle.go
|
||||
deaconSession := d.getDeaconSessionName()
|
||||
|
||||
// Check if tmux session exists and Claude is running (observable reality)
|
||||
hasSession, sessionErr := d.tmux.HasSession(deaconSession)
|
||||
if sessionErr == nil && hasSession {
|
||||
if d.tmux.IsClaudeRunning(deaconSession) {
|
||||
// Deacon is running - nothing to do
|
||||
return
|
||||
}
|
||||
|
||||
// CIRCUIT BREAKER: If agent is marked "dead" by checkStaleAgents(),
|
||||
// force-kill the session and restart. This handles stuck agents that
|
||||
// are still alive (zombie Claude sessions that haven't updated their bead).
|
||||
if beadState == "dead" {
|
||||
d.logger.Println("Deacon is marked dead (circuit breaker triggered), forcing restart...")
|
||||
deaconSession := d.getDeaconSessionName()
|
||||
hasSession, _ := d.tmux.HasSession(deaconSession)
|
||||
if hasSession {
|
||||
if err := d.tmux.KillSession(deaconSession); err != nil {
|
||||
d.logger.Printf("Warning: failed to kill dead Deacon session: %v", err)
|
||||
}
|
||||
}
|
||||
// Fall through to restart
|
||||
// Session exists but Claude not running - zombie session, kill it
|
||||
d.logger.Println("Deacon session exists but Claude not running, killing zombie session...")
|
||||
if err := d.tmux.KillSession(deaconSession); err != nil {
|
||||
d.logger.Printf("Warning: failed to kill zombie Deacon session: %v", err)
|
||||
}
|
||||
// Fall through to restart
|
||||
}
|
||||
|
||||
// Agent bead check failed or state is not running/working.
|
||||
// FALLBACK: Check if tmux session is actually healthy before attempting restart.
|
||||
// This prevents killing healthy sessions when bead state is stale or unreadable.
|
||||
// Skip this check if agent was marked dead (we already handled that above).
|
||||
if beadState != "dead" {
|
||||
deaconSession := d.getDeaconSessionName()
|
||||
hasSession, sessionErr := d.tmux.HasSession(deaconSession)
|
||||
if sessionErr == nil && hasSession {
|
||||
if d.tmux.IsClaudeRunning(deaconSession) {
|
||||
// STATE DIVERGENCE: tmux shows running but bead disagrees.
|
||||
// Don't kill (safety), but nudge the agent to reconcile its state.
|
||||
// This prevents silent state drift where bead and reality diverge.
|
||||
d.logger.Printf("STATE DIVERGENCE: Deacon bead='%s' but Claude is running in tmux", beadState)
|
||||
nudgeMsg := "[DAEMON] State divergence detected: your agent bead shows '" + beadState + "' but you appear running. Please run: bd agent state " + deaconSession + " running"
|
||||
if err := d.tmux.NudgeSession(deaconSession, nudgeMsg); err != nil {
|
||||
d.logger.Printf("Warning: failed to nudge Deacon about state divergence: %v", err)
|
||||
}
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Agent not running (or bead not found) AND session is not healthy - start it
|
||||
d.logger.Println("Deacon not running per agent bead, starting...")
|
||||
// Deacon not running - start it
|
||||
d.logger.Println("Deacon not running, starting...")
|
||||
|
||||
// Create session in deacon directory (ensures correct CLAUDE.md is loaded)
|
||||
// Use EnsureSessionFresh to handle zombie sessions that exist but have dead Claude
|
||||
@@ -426,39 +397,11 @@ func (d *Daemon) ensureWitnessesRunning() {
|
||||
}
|
||||
|
||||
// ensureWitnessRunning ensures the witness for a specific rig is running.
|
||||
// Discover, don't track: uses Manager.Start() which checks tmux directly (gt-zecmc).
|
||||
func (d *Daemon) ensureWitnessRunning(rigName string) {
|
||||
prefix := config.GetRigPrefix(d.config.TownRoot, rigName)
|
||||
agentID := beads.WitnessBeadIDWithPrefix(prefix, rigName)
|
||||
sessionName := "gt-" + rigName + "-witness"
|
||||
|
||||
// Check agent bead state (ZFC: trust what agent reports)
|
||||
beadState, beadErr := d.getAgentBeadState(agentID)
|
||||
if beadErr == nil {
|
||||
if beadState == "running" || beadState == "working" {
|
||||
// Agent reports it's running - trust it
|
||||
return
|
||||
}
|
||||
|
||||
// CIRCUIT BREAKER: If agent is marked "dead" by checkStaleAgents(),
|
||||
// force-kill the session and restart. This handles stuck agents that
|
||||
// are still alive (zombie Claude sessions that haven't updated their bead).
|
||||
if beadState == "dead" {
|
||||
d.logger.Printf("Witness for %s is marked dead (circuit breaker triggered), forcing restart...", rigName)
|
||||
hasSession, _ := d.tmux.HasSession(sessionName)
|
||||
if hasSession {
|
||||
if err := d.tmux.KillSession(sessionName); err != nil {
|
||||
d.logger.Printf("Warning: failed to kill dead witness session for %s: %v", rigName, err)
|
||||
}
|
||||
}
|
||||
// Fall through to restart
|
||||
}
|
||||
}
|
||||
|
||||
// Agent not running (or bead not found) - use Manager.Start() for unified startup
|
||||
// Manager.Start() handles: zombie detection, session creation, env vars, theming,
|
||||
// WaitForClaudeReady, and crucially - startup/propulsion nudges (GUPP)
|
||||
d.logger.Printf("Witness for %s not running per agent bead, starting...", rigName)
|
||||
|
||||
// WaitForClaudeReady, and crucially - startup/propulsion nudges (GUPP).
|
||||
// It returns ErrAlreadyRunning if Claude is already running in tmux.
|
||||
r := &rig.Rig{
|
||||
Name: rigName,
|
||||
Path: filepath.Join(d.config.TownRoot, rigName),
|
||||
@@ -467,20 +410,14 @@ func (d *Daemon) ensureWitnessRunning(rigName string) {
|
||||
|
||||
if err := mgr.Start(false); err != nil {
|
||||
if err == witness.ErrAlreadyRunning {
|
||||
// STATE DIVERGENCE: tmux shows running but bead disagrees.
|
||||
// Don't kill (safety), but nudge the agent to reconcile its state.
|
||||
d.logger.Printf("STATE DIVERGENCE: Witness for %s bead='%s' but Claude is running in tmux", rigName, beadState)
|
||||
nudgeMsg := "[DAEMON] State divergence detected: your agent bead shows '" + beadState + "' but you appear running. Please run: bd agent state " + agentID + " running"
|
||||
if err := d.tmux.NudgeSession(sessionName, nudgeMsg); err != nil {
|
||||
d.logger.Printf("Warning: failed to nudge Witness %s about state divergence: %v", rigName, err)
|
||||
}
|
||||
// Already running - nothing to do
|
||||
return
|
||||
}
|
||||
d.logger.Printf("Error starting witness for %s: %v", rigName, err)
|
||||
return
|
||||
}
|
||||
|
||||
d.logger.Printf("Witness session for %s started successfully (with nudges)", rigName)
|
||||
d.logger.Printf("Witness session for %s started successfully", rigName)
|
||||
}
|
||||
|
||||
// ensureRefineriesRunning ensures refineries are running for all rigs.
|
||||
@@ -493,39 +430,11 @@ func (d *Daemon) ensureRefineriesRunning() {
|
||||
}
|
||||
|
||||
// ensureRefineryRunning ensures the refinery for a specific rig is running.
|
||||
// Discover, don't track: uses Manager.Start() which checks tmux directly (gt-zecmc).
|
||||
func (d *Daemon) ensureRefineryRunning(rigName string) {
|
||||
prefix := config.GetRigPrefix(d.config.TownRoot, rigName)
|
||||
agentID := beads.RefineryBeadIDWithPrefix(prefix, rigName)
|
||||
sessionName := "gt-" + rigName + "-refinery"
|
||||
|
||||
// Check agent bead state (ZFC: trust what agent reports)
|
||||
beadState, beadErr := d.getAgentBeadState(agentID)
|
||||
if beadErr == nil {
|
||||
if beadState == "running" || beadState == "working" {
|
||||
// Agent reports it's running - trust it
|
||||
return
|
||||
}
|
||||
|
||||
// CIRCUIT BREAKER: If agent is marked "dead" by checkStaleAgents(),
|
||||
// force-kill the session and restart. This handles stuck agents that
|
||||
// are still alive (zombie Claude sessions that haven't updated their bead).
|
||||
if beadState == "dead" {
|
||||
d.logger.Printf("Refinery for %s is marked dead (circuit breaker triggered), forcing restart...", rigName)
|
||||
hasSession, _ := d.tmux.HasSession(sessionName)
|
||||
if hasSession {
|
||||
if err := d.tmux.KillSession(sessionName); err != nil {
|
||||
d.logger.Printf("Warning: failed to kill dead refinery session for %s: %v", rigName, err)
|
||||
}
|
||||
}
|
||||
// Fall through to restart
|
||||
}
|
||||
}
|
||||
|
||||
// Agent not running (or bead not found) - use Manager.Start() for unified startup
|
||||
// Manager.Start() handles: zombie detection, session creation, env vars, theming,
|
||||
// WaitForClaudeReady, and crucially - startup/propulsion nudges (GUPP)
|
||||
d.logger.Printf("Refinery for %s not running per agent bead, starting...", rigName)
|
||||
|
||||
// WaitForClaudeReady, and crucially - startup/propulsion nudges (GUPP).
|
||||
// It returns ErrAlreadyRunning if Claude is already running in tmux.
|
||||
r := &rig.Rig{
|
||||
Name: rigName,
|
||||
Path: filepath.Join(d.config.TownRoot, rigName),
|
||||
@@ -534,20 +443,14 @@ func (d *Daemon) ensureRefineryRunning(rigName string) {
|
||||
|
||||
if err := mgr.Start(false); err != nil {
|
||||
if err == refinery.ErrAlreadyRunning {
|
||||
// STATE DIVERGENCE: tmux shows running but bead disagrees.
|
||||
// Don't kill (safety), but nudge the agent to reconcile its state.
|
||||
d.logger.Printf("STATE DIVERGENCE: Refinery for %s bead='%s' but Claude is running in tmux", rigName, beadState)
|
||||
nudgeMsg := "[DAEMON] State divergence detected: your agent bead shows '" + beadState + "' but you appear running. Please run: bd agent state " + agentID + " running"
|
||||
if err := d.tmux.NudgeSession(sessionName, nudgeMsg); err != nil {
|
||||
d.logger.Printf("Warning: failed to nudge Refinery %s about state divergence: %v", rigName, err)
|
||||
}
|
||||
// Already running - nothing to do
|
||||
return
|
||||
}
|
||||
d.logger.Printf("Error starting refinery for %s: %v", rigName, err)
|
||||
return
|
||||
}
|
||||
|
||||
d.logger.Printf("Refinery session for %s started successfully (with nudges)", rigName)
|
||||
d.logger.Printf("Refinery session for %s started successfully", rigName)
|
||||
}
|
||||
|
||||
// getKnownRigs returns list of registered rig names.
|
||||
|
||||
Reference in New Issue
Block a user