From 87169a3fc75903e60fab806d1b7238e05c86ecdd Mon Sep 17 00:00:00 2001 From: gastown/crew/joe Date: Tue, 6 Jan 2026 20:42:11 -0800 Subject: [PATCH] fix: complete removal of agent_state observable tracking (gt-zecmc) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Additional cleanup from the agent_state refactoring: - Remove dead code: checkStaleAgents(), markAgentDead() in lifecycle.go - Remove dead code: reportAgentState(), getAgentFields() in prime.go - Update getAgentBeadState() comment to clarify non-observable states only - Update mol-witness-patrol.formula.toml to use tmux discovery - Update mol-polecat-lease.formula.toml to use POLECAT_DONE mail - Update docs/watchdog-chain.md to reflect new architecture 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- docs/watchdog-chain.md | 22 ++-- internal/cmd/prime.go | 86 +------------- internal/daemon/lifecycle.go | 108 ++---------------- .../formulas/mol-polecat-lease.formula.toml | 8 +- .../formulas/mol-witness-patrol.formula.toml | 2 +- 5 files changed, 28 insertions(+), 198 deletions(-) diff --git a/docs/watchdog-chain.md b/docs/watchdog-chain.md index 70604723..fd3445af 100644 --- a/docs/watchdog-chain.md +++ b/docs/watchdog-chain.md @@ -82,11 +82,11 @@ The daemon runs a heartbeat tick every 3 minutes: func (d *Daemon) heartbeatTick() { d.ensureBootRunning() // 1. Spawn Boot for triage d.checkDeaconHeartbeat() // 2. Belt-and-suspenders fallback - d.ensureWitnessesRunning() // 3. Witness health - d.triggerPendingSpawns() // 4. Bootstrap polecats - d.processLifecycleRequests() // 5. Cycle/restart requests - d.checkStaleAgents() // 6. Timeout detection - // ... more checks + d.ensureWitnessesRunning() // 3. Witness health (checks tmux directly) + d.ensureRefineriesRunning() // 4. Refinery health (checks tmux directly) + d.triggerPendingSpawns() // 5. Bootstrap polecats + d.processLifecycleRequests() // 6. 
Cycle/restart requests + // Agent state derived from tmux, not recorded in beads (gt-zecmc) } ``` @@ -190,7 +190,7 @@ Multiple layers ensure recovery: 1. **Boot triage** - Intelligent observation, first line 2. **Daemon checkDeaconHeartbeat()** - Belt-and-suspenders if Boot fails -3. **Daemon checkStaleAgents()** - Timeout-based detection +3. **Tmux-based discovery** - Daemon checks tmux sessions directly (no bead state) 4. **Human escalation** - Mail to overseer for unrecoverable states ## State Files @@ -239,9 +239,11 @@ gt deacon health-check ### Status Shows Wrong State -**Symptom**: `gt status` shows "stopped" for running agents -**Cause**: Bead state and tmux state diverged -**Fix**: Reconcile with `gt sync-status` or restart agent +**Symptom**: `gt status` shows wrong state for agents +**Cause**: Previously bead state and tmux state could diverge +**Fix**: As of gt-zecmc, status derives state from tmux directly (no bead state for +observable conditions like running/stopped). Non-observable states (stuck, awaiting-gate) +are still stored in beads. ## Design Decision: Keep Separation @@ -284,7 +286,7 @@ The separation is correct; these bugs need fixing: 1. **Session confusion** (gt-sgzsb): Boot spawns in wrong session 2. **Zombie blocking** (gt-j1i0r): Daemon can't kill zombie sessions -3. **Status mismatch** (gt-doih4): Bead vs tmux state divergence +3. ~~**Status mismatch** (gt-doih4): Bead vs tmux state divergence~~ → FIXED in gt-zecmc 4. **Ensure semantics** (gt-ekc5u): Start should kill zombies first ## Summary diff --git a/internal/cmd/prime.go b/internal/cmd/prime.go index b4d93c0b..528371fd 100644 --- a/internal/cmd/prime.go +++ b/internal/cmd/prime.go @@ -1186,89 +1186,9 @@ func acquireIdentityLock(ctx RoleContext) error { return nil } -// reportAgentState updates the agent bead to report the agent's current state. -// This implements ZFC-compliant self-reporting of agent state. -// Agents call this on startup (running) and shutdown (stopped). 
-// For crew workers, creates the agent bead if it doesn't exist. -func reportAgentState(ctx RoleContext, state string) { - agentBeadID := getAgentBeadID(ctx) - if agentBeadID == "" { - return - } - - // Use the beads API directly to update agent state - // This is more reliable than shelling out to bd - bd := beads.New(ctx.WorkDir) - - // Check if agent bead exists, create if needed (especially for crew workers) - if _, err := bd.Show(agentBeadID); err != nil { - // Agent bead doesn't exist - create it - fields := getAgentFields(ctx, state) - if fields != nil { - _, createErr := bd.CreateAgentBead(agentBeadID, agentBeadID, fields) - if createErr != nil { - // Silently ignore - beads might not be configured - return - } - // Bead created with initial state, no need to update - return - } - } - - // Update existing agent bead state - if err := bd.UpdateAgentState(agentBeadID, state, nil); err != nil { - // Silently ignore errors - don't fail prime if state reporting fails - return - } -} - -// getAgentFields returns the AgentFields for creating a new agent bead. 
-func getAgentFields(ctx RoleContext, state string) *beads.AgentFields { - switch ctx.Role { - case RoleCrew: - return &beads.AgentFields{ - RoleType: "crew", - Rig: ctx.Rig, - AgentState: state, - RoleBead: beads.RoleBeadIDTown("crew"), - } - case RolePolecat: - return &beads.AgentFields{ - RoleType: "polecat", - Rig: ctx.Rig, - AgentState: state, - RoleBead: beads.RoleBeadIDTown("polecat"), - } - case RoleMayor: - return &beads.AgentFields{ - RoleType: "mayor", - AgentState: state, - RoleBead: beads.RoleBeadIDTown("mayor"), - } - case RoleDeacon: - return &beads.AgentFields{ - RoleType: "deacon", - AgentState: state, - RoleBead: beads.RoleBeadIDTown("deacon"), - } - case RoleWitness: - return &beads.AgentFields{ - RoleType: "witness", - Rig: ctx.Rig, - AgentState: state, - RoleBead: beads.RoleBeadIDTown("witness"), - } - case RoleRefinery: - return &beads.AgentFields{ - RoleType: "refinery", - Rig: ctx.Rig, - AgentState: state, - RoleBead: beads.RoleBeadIDTown("refinery"), - } - default: - return nil - } -} +// NOTE: reportAgentState() and getAgentFields() were removed in gt-zecmc. +// Agent liveness is now discovered from tmux, not recorded in beads. +// "Discover, don't track" principle: observable state should not be recorded. // getAgentBeadID returns the agent bead ID for the current role. // Town-level agents (mayor, deacon) use hq- prefix; rig-scoped agents use the rig's prefix. diff --git a/internal/daemon/lifecycle.go b/internal/daemon/lifecycle.go index 698605d1..6fcd8b35 100644 --- a/internal/daemon/lifecycle.go +++ b/internal/daemon/lifecycle.go @@ -564,9 +564,10 @@ type AgentBeadInfo struct { LastUpdate string `json:"updated_at"` } -// getAgentBeadState reads agent state from an agent bead. -// This is the ZFC-compliant way to get agent state: trust what agents report. -// Returns the agent_state field value (idle|running|stuck|stopped) or empty string if not found. +// getAgentBeadState reads non-observable agent state from an agent bead. 
+// Per gt-zecmc: Observable states (running, dead, idle) are derived from tmux. +// Only non-observable states (stuck, awaiting-gate, muted, paused) are stored in beads. +// Returns the agent_state field value or empty string if not found. func (d *Daemon) getAgentBeadState(agentBeadID string) (string, error) { info, err := d.getAgentBeadInfo(agentBeadID) if err != nil { @@ -661,104 +662,9 @@ func (d *Daemon) identityToAgentBeadID(identity string) string { } } -// DeadAgentTimeout is how long an agent can report "running" without updating -// before the daemon marks it as dead. This is a fallback for crashed agents. -const DeadAgentTimeout = 15 * time.Minute - -// checkStaleAgents looks for agents that report state=running but haven't -// updated their bead recently. These are likely dead agents that crashed -// without updating their state. This is the timeout fallback per gt-2hzl4. -func (d *Daemon) checkStaleAgents() { - // Known agent bead IDs to check - agentBeadIDs := []string{ - beads.DeaconBeadIDTown(), - beads.MayorBeadIDTown(), - } - - // Dynamically discover rigs from the rigs config - rigsConfigPath := filepath.Join(d.config.TownRoot, "mayor", "rigs.json") - rigsConfig, err := config.LoadRigsConfig(rigsConfigPath) - if err != nil { - // Log warning but continue with global agents only - d.logger.Printf("Warning: could not load rigs config: %v", err) - } else { - // Add rig-specific agents (witness, refinery) for each discovered rig - for rigName, rigEntry := range rigsConfig.Rigs { - // Get rig prefix from config (defaults to "gt" if not set) - prefix := "gt" - if rigEntry.BeadsConfig != nil && rigEntry.BeadsConfig.Prefix != "" { - prefix = strings.TrimSuffix(rigEntry.BeadsConfig.Prefix, "-") - } - agentBeadIDs = append(agentBeadIDs, beads.WitnessBeadIDWithPrefix(prefix, rigName)) - agentBeadIDs = append(agentBeadIDs, beads.RefineryBeadIDWithPrefix(prefix, rigName)) - } - } - - for _, agentBeadID := range agentBeadIDs { - info, err := 
d.getAgentBeadInfo(agentBeadID) - if err != nil { - // Agent bead doesn't exist or error fetching - skip - continue - } - - // Only check agents reporting they're running/working - if info.State != "running" && info.State != "working" { - continue - } - - // Parse the updated_at timestamp - updatedAt, err := time.Parse(time.RFC3339, info.LastUpdate) - if err != nil { - d.logger.Printf("Warning: cannot parse updated_at for %s: %v", agentBeadID, err) - continue - } - - // Check if stale - age := time.Since(updatedAt) - if age > DeadAgentTimeout { - d.logger.Printf("Agent %s appears dead (state=%s, last update %v ago, timeout %v)", - agentBeadID, info.State, age.Round(time.Minute), DeadAgentTimeout) - - // Mark as dead - if err := d.markAgentDead(agentBeadID); err != nil { - d.logger.Printf("Warning: failed to mark %s as dead: %v", agentBeadID, err) - } else { - d.logger.Printf("Marked agent %s as dead due to timeout", agentBeadID) - } - } - } -} - -// markAgentDead updates an agent bead's state to "dead". -// Uses bd update to modify the description with the new agent_state. 
-func (d *Daemon) markAgentDead(agentBeadID string) error { - // Get current agent info - info, err := d.getAgentBeadInfo(agentBeadID) - if err != nil { - return fmt.Errorf("fetching agent bead: %w", err) - } - - // Build new description with updated state - newDesc := fmt.Sprintf("role_type: %s\nrig: %s\nagent_state: dead\nhook_bead: %s\nrole_bead: %s\n\nMarked dead by daemon at %s (was %s, last update too old)", - info.RoleType, - info.Rig, - info.HookBead, - info.RoleBead, - time.Now().Format(time.RFC3339), - info.State, - ) - - // Use bd update to set the new description - cmd := exec.Command("bd", "update", agentBeadID, "--description", newDesc) - cmd.Dir = d.config.TownRoot - - output, err := cmd.CombinedOutput() - if err != nil { - return fmt.Errorf("bd update: %w (output: %s)", err, string(output)) - } - - return nil -} +// NOTE: checkStaleAgents() and markAgentDead() were removed in gt-zecmc. +// Agent liveness is now discovered from tmux, not recorded in beads. +// "Discover, don't track" principle: observable state should not be recorded. // identityToBDActor converts a daemon identity to BD_ACTOR format (with slashes). // Uses parseIdentity to extract components, then builds the slash format. diff --git a/internal/formula/formulas/mol-polecat-lease.formula.toml b/internal/formula/formulas/mol-polecat-lease.formula.toml index 75acd76c..1120808c 100644 --- a/internal/formula/formulas/mol-polecat-lease.formula.toml +++ b/internal/formula/formulas/mol-polecat-lease.formula.toml @@ -62,7 +62,8 @@ Polecat is actively working. Monitor for stuck or completion. **Periodic checks:** - Use standard nudge protocol from Witness CLAUDE.md -- Watch for POLECAT_DONE mail or agent_state=done +- Watch for POLECAT_DONE mail (primary completion signal) +- Check tmux session: `gt session status {{rig}}/polecats/{{polecat}}` **Signs of progress:** - Git commits appearing @@ -73,11 +74,12 @@ Polecat is actively working. Monitor for stuck or completion. 
- Idle >15 minutes - Repeated errors - Explicit "I'm stuck" messages +- Agent bead shows stuck state: `bd show <agent-bead-id>` -**If POLECAT_DONE received or agent_state=done:** +**If POLECAT_DONE mail received:** Proceed to verifying step. -**Exit criteria:** Polecat signals completion (POLECAT_DONE mail or state=done).""" +**Exit criteria:** Polecat signals completion via POLECAT_DONE mail.""" [[steps]] id = "verifying" diff --git a/internal/formula/formulas/mol-witness-patrol.formula.toml b/internal/formula/formulas/mol-witness-patrol.formula.toml index 12c612b9..2c8dca1a 100644 --- a/internal/formula/formulas/mol-witness-patrol.formula.toml +++ b/internal/formula/formulas/mol-witness-patrol.formula.toml @@ -20,7 +20,7 @@ needs = ['process-cleanups'] title = 'Ensure refinery is alive' [[steps]] -description = "Survey all polecats using agent beads (ZFC: trust what agents report).\n\n**Step 1: List polecat agent beads**\n\n```bash\nbd list --type=agent --json\n```\n\nFilter the JSON output for entries where description contains `role_type: polecat`.\nEach polecat agent bead has fields in its description:\n- `role_type: polecat`\n- `rig: `\n- `agent_state: running|idle|stuck|done`\n- `hook_bead: `\n\n**Step 2: For each polecat, check agent_state**\n\n| agent_state | Meaning | Action |\n|-------------|---------|--------|\n| running | Actively working | Check progress (Step 3) |\n| idle | No work assigned | Auto-nuke if clean (Step 3a) |\n| stuck | Self-reported stuck | Handle stuck protocol |\n| done | Work complete | Verify cleanup triggered (see Step 4a) |\n\n**Step 3: For running polecats, assess progress**\n\nCheck the hook_bead field to see what they're working on:\n```bash\nbd show # See current step/issue\n```\n\nYou can also verify they're responsive:\n```bash\ntmux capture-pane -t gt-- -p | tail -20\n```\n\nLook for:\n- Recent tool activity → making progress\n- Idle at prompt → may need nudge\n- Error messages → may need help\n\n**Step 3a: For idle polecats, auto-nuke 
if clean**\n\nWhen agent_state=idle, the polecat has no work assigned. Check if it's safe to nuke:\n\n```bash\n# Check git status in the polecat's worktree\ncd polecats/\ngit status --porcelain # Should be empty (clean)\ngit log origin/main..HEAD # Should have no unpushed commits\n```\n\n**If clean** (no uncommitted changes, no unpushed commits):\n```bash\n# Safe to nuke - no work to lose\ngt polecat nuke \n```\nLog the auto-nuke for audit purposes. No escalation needed.\n\n**If dirty** (uncommitted or unpushed work):\n```bash\n# Escalate to Mayor - polecat has work that might be valuable\ngt mail send mayor/ -s \\\"IDLE_DIRTY: has uncommitted work\\\" \\\n -m \\\"Polecat: \nState: idle (no hook_bead)\nGit status: \nUnpushed commits: \n\nPlease advise: recover work or discard?\\\"\n```\n\n**Rationale**: Idle polecats with clean git state are pure overhead. They have\nno work and no state worth preserving. Nuking them immediately frees resources\nand reduces noise. Only escalate when there's actual work at risk.\n\n**Step 4: Decide action**\n\n| Observation | Action |\n|-------------|--------|\n| agent_state=running, recent activity | None |\n| agent_state=running, idle 5-15 min | Gentle nudge |\n| agent_state=running, idle 15+ min | Direct nudge with deadline |\n| agent_state=stuck | Assess and help or escalate |\n| agent_state=done | Verify cleanup triggered (see Step 4a) |\n\n**Step 4a: Handle agent_state=done**\n\nIn the ephemeral model, polecats with agent_state=done and cleanup_status=clean\nshould already be nuked by HandlePolecatDone. Finding one here indicates:\n\n1. **Stale agent bead** - polecat was nuked but bead remains\n ```bash\n # Verify polecat doesn't exist anymore\n ls polecats/ 2>/dev/null || echo \"Already nuked\"\n ```\n If nuked, the agent bead is stale. Clean it up or ignore.\n\n2. 
**Cleanup wisp exists** - polecat has dirty state needing intervention\n ```bash\n bd list --wisp --labels=polecat: --status=open\n ```\n Process in process-cleanups step.\n\n3. **No wisp, polecat exists** - POLECAT_DONE mail was missed\n Try auto-nuke directly (ephemeral model):\n ```bash\n # Check cleanup_status and nuke if clean\n gt polecat nuke # Will fail if dirty\n ```\n If nuke fails (dirty state), create cleanup wisp for investigation.\n\n**Step 5: Execute nudges**\n```bash\ngt nudge /polecats/ \"How's progress? Need help?\"\n```\n\n**Step 6: Escalate if needed**\n```bash\ngt mail send mayor/ -s \"Escalation: stuck\" \\\n -m \"Polecat reports stuck. Please intervene.\"\n```\n\n**Parallelism**: Use Task tool subagents to inspect multiple polecats concurrently.\n\n**ZFC Principle**: Trust agent_state from beads. Don't infer state from PID/tmux." +description = "Survey all polecats using tmux (discover, don't track).\n\n**Principle**: Agent liveness is discovered from tmux, not recorded in beads.\nOnly non-observable states like 'stuck' are stored in beads.\n\n**Step 1: List polecat sessions**\n\n```bash\n# Find all polecat tmux sessions for this rig\ntmux list-sessions -F '#{session_name}' 2>/dev/null | grep \"^gt-<rig>-\"\n```\n\nFor each session, check if Claude is actively running:\n```bash\n# Check if Claude process is running in the session\ngt session status <rig>/polecats/<name>\n```\n\n**Step 2: For each polecat, assess state**\n\n| Observation | Meaning | Action |\n|-------------|---------|--------|\n| Session exists, Claude running | Actively working | Check progress (Step 3) |\n| Session exists, Claude not running | Zombie session | Kill and respawn if work hooked |\n| No session | Not running | Check if should exist |\n\nAlso check the agent bead for non-observable state:\n```bash\nbd show <agent-bead-id> # Check for stuck, awaiting-gate, etc.\n```\n\n**Step 3: For running polecats, assess progress**\n\nCheck hook_bead to see what they're working on:\n```bash\nbd show <agent-bead-id> # Look at 
hook_bead field\n```\n\nCapture pane to assess activity:\n```bash\ntmux capture-pane -t gt-<rig>-<name> -p | tail -20\n```\n\nLook for:\n- Recent tool activity → making progress\n- Idle at prompt → may need nudge\n- Error messages → may need help\n\n**Step 3a: For idle polecats (no hook_bead), auto-nuke if clean**\n\nWhen a polecat has no hook_bead, it has no assigned work. Check if safe to nuke:\n\n```bash\n# Check git status in the polecat's worktree\ncd polecats/<name>\ngit status --porcelain # Should be empty (clean)\ngit log origin/main..HEAD # Should have no unpushed commits\n```\n\n**If clean** (no uncommitted changes, no unpushed commits):\n```bash\n# Safe to nuke - no work to lose\ngt polecat nuke <name>\n```\nLog the auto-nuke for audit purposes. No escalation needed.\n\n**If dirty** (uncommitted or unpushed work):\n```bash\n# Escalate to Mayor - polecat has work that might be valuable\ngt mail send mayor/ -s \"IDLE_DIRTY: <name> has uncommitted work\" \\\n -m \"Polecat: <name>\nState: no work hooked\nGit status: <git-status-output>\nUnpushed commits: <commit-list>\n\nPlease advise: recover work or discard?\"\n```\n\n**Rationale**: Idle polecats with clean git state are pure overhead. They have\nno work and no state worth preserving. Nuking them immediately frees resources\nand reduces noise. Only escalate when there's actual work at risk.\n\n**Step 4: Decide action**\n\n| Observation | Action |\n|-------------|--------|\n| Running, recent activity | None |\n| Running, idle 5-15 min | Gentle nudge |\n| Running, idle 15+ min | Direct nudge with deadline |\n| Bead shows stuck | Assess and help or escalate |\n| POLECAT_DONE mail received | Verify cleanup (see Step 4a) |\n\n**Step 4a: Handle POLECAT_DONE**\n\nIn the ephemeral model, POLECAT_DONE triggers immediate cleanup:\n\n1. **Check cleanup_status** in agent bead\n2. **If clean**: Auto-nuke immediately\n ```bash\n gt polecat nuke <name>\n ```\n3. 
**If dirty**: Create cleanup wisp for manual intervention\n\n**Step 5: Execute nudges**\n```bash\ngt nudge <rig>/polecats/<name> \"How's progress? Need help?\"\n```\n\n**Step 6: Escalate if needed**\n```bash\ngt mail send mayor/ -s \"Escalation: <name> stuck\" \\\n -m \"Polecat <name> reports stuck. Please intervene.\"\n```\n\n**Parallelism**: Use Task tool subagents to inspect multiple polecats concurrently.\n\n**Discovery Principle**: Derive state from tmux. Only trust bead state for\nnon-observable conditions like 'stuck' or 'awaiting-gate'." id = 'survey-workers' needs = ['check-refinery'] title = 'Inspect all active polecats'