Add timeout fallback for dead agents (gt-2hzl4)

- Add checkStaleAgents() to detect agents reporting "running" or "working" whose bead has not been updated recently
- Add markAgentDead() to update agent bead state to "dead"
- Integrate stale agent check into heartbeat cycle
- DeadAgentTimeout set to 15 minutes

This is a safety mechanism for agents that crash without updating their state.
The daemon now marks them as dead so they can be restarted.

Also fixes duplicate AgentFields declaration - now uses beads.go version with
ParseAgentFieldsFromDescription alias in fields.go.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Steve Yegge
2025-12-28 01:55:52 -08:00
parent a1715fa91f
commit 597c6b8071
3 changed files with 96 additions and 95 deletions

View File

@@ -3,103 +3,12 @@ package beads
import "strings"
// AgentFields holds parsed fields from an agent bead's description.
// Agent beads store their state as key: value lines in the description.
// A field whose line is missing, empty, or the literal "null" is left as the
// empty string by ParseAgentFieldsFromDescription.
type AgentFields struct {
RoleType string // role_type: mayor, deacon, witness, refinery, polecat
Rig string // rig: the rig name, e.g. gastown (or null when the agent has no rig)
AgentState string // agent_state: idle, running, working, stopped; the daemon's timeout fallback also writes "dead"
HookBead string // hook_bead: the bead ID on the hook (or null)
RoleBead string // role_bead: the role definition bead
}
// Note: AgentFields, ParseAgentFields, FormatAgentDescription, and CreateAgentBead are in beads.go
// ParseAgentFields reads the agent fields stored in an issue's description.
// It hands the description text to ParseAgentFieldsFromDescription and
// returns that result. A nil issue or an empty description yields nil.
func ParseAgentFields(issue *Issue) *AgentFields {
	if issue != nil && issue.Description != "" {
		return ParseAgentFieldsFromDescription(issue.Description)
	}
	return nil
}
// ParseAgentFieldsFromDescription scans a description line by line for
// "key: value" pairs and collects the recognized agent fields.
//
// Keys are matched case-insensitively and accept snake_case, kebab-case, or
// run-together spellings (for example role_type, role-type, roletype). Each
// line is split at its first colon, so values may themselves contain colons.
// Lines without a colon, with an unrecognized key, or whose value is empty or
// the literal "null" are ignored.
//
// It returns nil when description is empty or no recognized field was found.
// It is kept as a separate entry point for callers (such as the daemon) that
// hold only the description text.
func ParseAgentFieldsFromDescription(description string) *AgentFields {
	if description == "" {
		return nil
	}

	parsed := &AgentFields{}
	matched := false

	for _, raw := range strings.Split(description, "\n") {
		// A blank line has no colon, so it falls out here as well.
		name, val, found := strings.Cut(strings.TrimSpace(raw), ":")
		if !found {
			continue
		}

		val = strings.TrimSpace(val)
		if val == "" || val == "null" {
			continue
		}

		// Pick the destination field for this key; nil means "not an agent field".
		var dst *string
		switch strings.ToLower(strings.TrimSpace(name)) {
		case "role_type", "role-type", "roletype":
			dst = &parsed.RoleType
		case "rig":
			dst = &parsed.Rig
		case "agent_state", "agent-state", "agentstate":
			dst = &parsed.AgentState
		case "hook_bead", "hook-bead", "hookbead":
			dst = &parsed.HookBead
		case "role_bead", "role-bead", "rolebead":
			dst = &parsed.RoleBead
		}
		if dst == nil {
			continue
		}
		*dst = val
		matched = true
	}

	if matched {
		return parsed
	}
	return nil
}
// FormatAgentFields formats AgentFields as a string suitable for an issue description.
// Only non-empty fields are included.
//
// Each populated field becomes one "key: value" line, emitted in the fixed
// order role_type, rig, agent_state, hook_bead, role_bead, and the lines are
// joined with "\n" (no trailing newline). A nil fields pointer yields "".
func FormatAgentFields(fields *AgentFields) string {
if fields == nil {
return ""
}
var lines []string
if fields.RoleType != "" {
lines = append(lines, "role_type: "+fields.RoleType)
}
if fields.Rig != "" {
lines = append(lines, "rig: "+fields.Rig)
}
if fields.AgentState != "" {
lines = append(lines, "agent_state: "+fields.AgentState)
}
if fields.HookBead != "" {
lines = append(lines, "hook_bead: "+fields.HookBead)
}
if fields.RoleBead != "" {
lines = append(lines, "role_bead: "+fields.RoleBead)
}
return strings.Join(lines, "\n")
// NOTE(review): the statement below can never run (it follows a return) and
// `description` is not declared in this function. It looks like it belongs to
// a different function body (possibly a diff/merge artifact) - confirm before
// relying on this block.
return ParseAgentFields(description)
}
// AttachmentFields holds the attachment info for pinned beads.

View File

@@ -185,6 +185,10 @@ func (d *Daemon) heartbeat(state *State) {
// 4. Process lifecycle requests
d.processLifecycleRequests()
// 5. Check for stale agents (timeout fallback - gt-2hzl4)
// Agents that report "running" but haven't updated in too long are marked dead
d.checkStaleAgents()
// Update state
state.LastHeartbeat = time.Now()
state.HeartbeatCount++

View File

@@ -577,6 +577,94 @@ func (d *Daemon) identityToAgentBeadID(identity string) string {
}
}
// DeadAgentTimeout is how long an agent bead may stay in the "running" or
// "working" state without being updated before the daemon marks it as dead.
// checkStaleAgents compares it against the time elapsed since the bead's last
// update. This is a fallback for agents that crash without recording a state
// change.
const DeadAgentTimeout = 15 * time.Minute
// checkStaleAgents is the timeout fallback from gt-2hzl4. It walks a fixed
// list of agent beads and, for every agent whose state is "running" or
// "working" but whose last update is older than DeadAgentTimeout, logs the
// finding and asks markAgentDead to flip the bead to "dead".
//
// Problems are handled per agent: a bead that cannot be fetched is skipped
// silently, an unparseable timestamp or a failed markAgentDead call is logged
// as a warning, and the scan moves on to the next agent.
func (d *Daemon) checkStaleAgents() {
	// Town-level agents first, then one witness and one refinery per known rig.
	// The rig list is hard-coded for now; it could be discovered dynamically.
	beadIDs := []string{"gt-deacon", "gt-mayor"}
	for _, rigName := range []string{"gastown", "beads"} {
		beadIDs = append(beadIDs, "gt-witness-"+rigName, "gt-refinery-"+rigName)
	}

	for _, id := range beadIDs {
		info, err := d.getAgentBeadInfo(id)
		if err != nil {
			// Missing bead or fetch failure: nothing to judge for this agent.
			continue
		}

		// Agents in any other state are not claiming to be alive.
		if !(info.State == "running" || info.State == "working") {
			continue
		}

		lastSeen, err := time.Parse(time.RFC3339, info.LastUpdate)
		if err != nil {
			d.logger.Printf("Warning: cannot parse updated_at for %s: %v", id, err)
			continue
		}

		idle := time.Since(lastSeen)
		if idle <= DeadAgentTimeout {
			continue
		}

		d.logger.Printf("Agent %s appears dead (state=%s, last update %v ago, timeout %v)",
			id, info.State, idle.Round(time.Minute), DeadAgentTimeout)

		if err := d.markAgentDead(id); err != nil {
			d.logger.Printf("Warning: failed to mark %s as dead: %v", id, err)
			continue
		}
		d.logger.Printf("Marked agent %s as dead due to timeout", id)
	}
}
// markAgentDead rewrites an agent bead's description so that agent_state is
// "dead".
//
// It re-reads the bead through getAgentBeadInfo, rebuilds the description from
// the parsed role_type, rig, hook_bead and role_bead values, appends a note
// with the current time and the previous state, and applies it by running
// `bd update <id> --description <text>` with d.config.TownRoot as the working
// directory. The description is rebuilt from those parsed fields only;
// anything else the previous description contained is not carried over.
//
// Fields that are empty are written as "null", the placeholder used for unset
// agent fields, instead of a blank "key: " line.
//
// It returns a wrapped error if the bead cannot be fetched or if bd exits
// unsuccessfully (bd's combined output is included in the message).
func (d *Daemon) markAgentDead(agentBeadID string) error {
	// Get current agent info.
	info, err := d.getAgentBeadInfo(agentBeadID)
	if err != nil {
		return fmt.Errorf("fetching agent bead: %w", err)
	}

	// Unset fields are spelled "null" so the description keeps one well-formed
	// key: value line per field.
	orNull := func(v string) string {
		if v == "" {
			return "null"
		}
		return v
	}

	// Build new description with updated state.
	newDesc := fmt.Sprintf("role_type: %s\nrig: %s\nagent_state: dead\nhook_bead: %s\nrole_bead: %s\n\nMarked dead by daemon at %s (was %s, last update too old)",
		orNull(info.RoleType),
		orNull(info.Rig),
		orNull(info.HookBead),
		orNull(info.RoleBead),
		time.Now().Format(time.RFC3339),
		info.State,
	)

	// Use bd update to set the new description. exec.Command passes the
	// arguments directly, with no shell, so newDesc needs no quoting.
	cmd := exec.Command("bd", "update", agentBeadID, "--description", newDesc)
	cmd.Dir = d.config.TownRoot

	output, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("bd update: %w (output: %s)", err, string(output))
	}

	return nil
}
// identityToBDActor converts a daemon identity (with dashes) to BD_ACTOR format (with slashes).
// Examples:
// - "mayor" → "mayor"