From 597c6b80710b1c0024a3bbb38caf93be9f3d8ff0 Mon Sep 17 00:00:00 2001 From: Steve Yegge Date: Sun, 28 Dec 2025 01:55:52 -0800 Subject: [PATCH] Add timeout fallback for dead agents (gt-2hzl4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add checkStaleAgents() to detect agents reporting "running" but not updating - Add markAgentDead() to update agent bead state to "dead" - Integrate stale agent check into heartbeat cycle - DeadAgentTimeout set to 15 minutes This is a safety mechanism for agents that crash without updating their state. The daemon now marks them as dead so they can be restarted. Also fixes duplicate AgentFields declaration - now uses beads.go version with ParseAgentFieldsFromDescription alias in fields.go. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- internal/beads/fields.go | 99 ++---------------------------------- internal/daemon/daemon.go | 4 ++ internal/daemon/lifecycle.go | 88 ++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 95 deletions(-) diff --git a/internal/beads/fields.go b/internal/beads/fields.go index 487a95ab..4274f589 100644 --- a/internal/beads/fields.go +++ b/internal/beads/fields.go @@ -3,103 +3,12 @@ package beads import "strings" -// AgentFields holds parsed fields from an agent bead's description. -// Agent beads store their state as key: value lines in the description. -type AgentFields struct { - RoleType string // role_type: mayor, deacon, witness, refinery, polecat - Rig string // rig: gastown (or null) - AgentState string // agent_state: idle, running, working, stopped - HookBead string // hook_bead: the bead ID on the hook (or null) - RoleBead string // role_bead: the role definition bead -} +// Note: AgentFields, ParseAgentFields, FormatAgentDescription, and CreateAgentBead are in beads.go -// ParseAgentFields extracts agent fields from an issue's description. 
-// Fields are expected as "key: value" lines. Returns nil if no agent fields found. -func ParseAgentFields(issue *Issue) *AgentFields { - if issue == nil || issue.Description == "" { - return nil - } - return ParseAgentFieldsFromDescription(issue.Description) -} - -// ParseAgentFieldsFromDescription extracts agent fields from a description string. -// Returns nil if no agent fields found. +// ParseAgentFieldsFromDescription is an alias for ParseAgentFields. +// Used by daemon for compatibility. func ParseAgentFieldsFromDescription(description string) *AgentFields { - if description == "" { - return nil - } - - fields := &AgentFields{} - hasFields := false - - for _, line := range strings.Split(description, "\n") { - line = strings.TrimSpace(line) - if line == "" { - continue - } - - colonIdx := strings.Index(line, ":") - if colonIdx == -1 { - continue - } - - key := strings.TrimSpace(line[:colonIdx]) - value := strings.TrimSpace(line[colonIdx+1:]) - if value == "" || value == "null" { - continue - } - - switch strings.ToLower(key) { - case "role_type", "role-type", "roletype": - fields.RoleType = value - hasFields = true - case "rig": - fields.Rig = value - hasFields = true - case "agent_state", "agent-state", "agentstate": - fields.AgentState = value - hasFields = true - case "hook_bead", "hook-bead", "hookbead": - fields.HookBead = value - hasFields = true - case "role_bead", "role-bead", "rolebead": - fields.RoleBead = value - hasFields = true - } - } - - if !hasFields { - return nil - } - return fields -} - -// FormatAgentFields formats AgentFields as a string suitable for an issue description. -// Only non-empty fields are included. 
-func FormatAgentFields(fields *AgentFields) string { - if fields == nil { - return "" - } - - var lines []string - - if fields.RoleType != "" { - lines = append(lines, "role_type: "+fields.RoleType) - } - if fields.Rig != "" { - lines = append(lines, "rig: "+fields.Rig) - } - if fields.AgentState != "" { - lines = append(lines, "agent_state: "+fields.AgentState) - } - if fields.HookBead != "" { - lines = append(lines, "hook_bead: "+fields.HookBead) - } - if fields.RoleBead != "" { - lines = append(lines, "role_bead: "+fields.RoleBead) - } - - return strings.Join(lines, "\n") + return ParseAgentFields(description) } // AttachmentFields holds the attachment info for pinned beads. diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go index bdf7c1ab..a2409404 100644 --- a/internal/daemon/daemon.go +++ b/internal/daemon/daemon.go @@ -185,6 +185,10 @@ func (d *Daemon) heartbeat(state *State) { // 4. Process lifecycle requests d.processLifecycleRequests() + // 5. Check for stale agents (timeout fallback - gt-2hzl4) + // Agents that report "running" or "working" but haven't updated in too long are marked dead + d.checkStaleAgents() + // Update state state.LastHeartbeat = time.Now() state.HeartbeatCount++ diff --git a/internal/daemon/lifecycle.go b/internal/daemon/lifecycle.go index df86ead7..54cdbad1 100644 --- a/internal/daemon/lifecycle.go +++ b/internal/daemon/lifecycle.go @@ -577,6 +577,94 @@ func (d *Daemon) identityToAgentBeadID(identity string) string { } } +// DeadAgentTimeout is how long an agent can report "running" or "working" without updating +// before the daemon marks it as dead. This is a fallback for crashed agents. +const DeadAgentTimeout = 15 * time.Minute + +// checkStaleAgents looks for agents that report state=running or state=working but haven't +// updated their bead recently. These are likely dead agents that crashed +// without updating their state. This is the timeout fallback per gt-2hzl4. 
+func (d *Daemon) checkStaleAgents() { + // Known agent bead IDs to check + agentBeadIDs := []string{ + "gt-deacon", + "gt-mayor", + } + + // Add rig-specific agents (witness, refinery) for known rigs + // For now, we check gastown and beads - could be expanded to discover rigs dynamically + rigs := []string{"gastown", "beads"} + for _, rig := range rigs { + agentBeadIDs = append(agentBeadIDs, "gt-witness-"+rig) + agentBeadIDs = append(agentBeadIDs, "gt-refinery-"+rig) + } + + for _, agentBeadID := range agentBeadIDs { + info, err := d.getAgentBeadInfo(agentBeadID) + if err != nil { + // Agent bead doesn't exist or error fetching - skip + continue + } + + // Only check agents reporting they're running/working + if info.State != "running" && info.State != "working" { + continue + } + + // Parse the updated_at timestamp + updatedAt, err := time.Parse(time.RFC3339, info.LastUpdate) + if err != nil { + d.logger.Printf("Warning: cannot parse updated_at for %s: %v", agentBeadID, err) + continue + } + + // Check if stale + age := time.Since(updatedAt) + if age > DeadAgentTimeout { + d.logger.Printf("Agent %s appears dead (state=%s, last update %v ago, timeout %v)", + agentBeadID, info.State, age.Round(time.Minute), DeadAgentTimeout) + + // Mark as dead + if err := d.markAgentDead(agentBeadID); err != nil { + d.logger.Printf("Warning: failed to mark %s as dead: %v", agentBeadID, err) + } else { + d.logger.Printf("Marked agent %s as dead due to timeout", agentBeadID) + } + } + } +} + +// markAgentDead updates an agent bead's state to "dead". +// Uses bd update to modify the description with the new agent_state. 
+func (d *Daemon) markAgentDead(agentBeadID string) error { + // Get current agent info + info, err := d.getAgentBeadInfo(agentBeadID) + if err != nil { + return fmt.Errorf("fetching agent bead: %w", err) + } + + // Build new description with updated state + newDesc := fmt.Sprintf("role_type: %s\nrig: %s\nagent_state: dead\nhook_bead: %s\nrole_bead: %s\n\nMarked dead by daemon at %s (was %s, last update too old)", + info.RoleType, + info.Rig, + info.HookBead, + info.RoleBead, + time.Now().Format(time.RFC3339), + info.State, + ) + + // Use bd update to set the new description + cmd := exec.Command("bd", "update", agentBeadID, "--description", newDesc) + cmd.Dir = d.config.TownRoot + + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("bd update: %w (output: %s)", err, string(output)) + } + + return nil +} + // identityToBDActor converts a daemon identity (with dashes) to BD_ACTOR format (with slashes). // Examples: // - "mayor" → "mayor"