Add timeout fallback for dead agents (gt-2hzl4)
- Add checkStaleAgents() to detect agents reporting "running" but not updating
- Add markAgentDead() to update agent bead state to "dead"
- Integrate stale agent check into heartbeat cycle
- DeadAgentTimeout set to 15 minutes

This is a safety mechanism for agents that crash without updating their state. The daemon now marks them as dead so they can be restarted.

Also fixes duplicate AgentFields declaration - now uses beads.go version with ParseAgentFieldsFromDescription alias in fields.go.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -3,103 +3,12 @@ package beads
|
||||
|
||||
import "strings"
|
||||
|
||||
// AgentFields holds parsed fields from an agent bead's description.
|
||||
// Agent beads store their state as key: value lines in the description.
|
||||
type AgentFields struct {
|
||||
RoleType string // role_type: mayor, deacon, witness, refinery, polecat
|
||||
Rig string // rig: gastown (or null)
|
||||
AgentState string // agent_state: idle, running, working, stopped
|
||||
HookBead string // hook_bead: the bead ID on the hook (or null)
|
||||
RoleBead string // role_bead: the role definition bead
|
||||
}
|
||||
// Note: AgentFields, ParseAgentFields, FormatAgentDescription, and CreateAgentBead are in beads.go
|
||||
|
||||
// ParseAgentFields extracts agent fields from an issue's description.
|
||||
// Fields are expected as "key: value" lines. Returns nil if no agent fields found.
|
||||
func ParseAgentFields(issue *Issue) *AgentFields {
|
||||
if issue == nil || issue.Description == "" {
|
||||
return nil
|
||||
}
|
||||
return ParseAgentFieldsFromDescription(issue.Description)
|
||||
}
|
||||
|
||||
// ParseAgentFieldsFromDescription extracts agent fields from a description string.
|
||||
// Returns nil if no agent fields found.
|
||||
// ParseAgentFieldsFromDescription is an alias for ParseAgentFields.
|
||||
// Used by daemon for compatibility.
|
||||
func ParseAgentFieldsFromDescription(description string) *AgentFields {
|
||||
if description == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
fields := &AgentFields{}
|
||||
hasFields := false
|
||||
|
||||
for _, line := range strings.Split(description, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
colonIdx := strings.Index(line, ":")
|
||||
if colonIdx == -1 {
|
||||
continue
|
||||
}
|
||||
|
||||
key := strings.TrimSpace(line[:colonIdx])
|
||||
value := strings.TrimSpace(line[colonIdx+1:])
|
||||
if value == "" || value == "null" {
|
||||
continue
|
||||
}
|
||||
|
||||
switch strings.ToLower(key) {
|
||||
case "role_type", "role-type", "roletype":
|
||||
fields.RoleType = value
|
||||
hasFields = true
|
||||
case "rig":
|
||||
fields.Rig = value
|
||||
hasFields = true
|
||||
case "agent_state", "agent-state", "agentstate":
|
||||
fields.AgentState = value
|
||||
hasFields = true
|
||||
case "hook_bead", "hook-bead", "hookbead":
|
||||
fields.HookBead = value
|
||||
hasFields = true
|
||||
case "role_bead", "role-bead", "rolebead":
|
||||
fields.RoleBead = value
|
||||
hasFields = true
|
||||
}
|
||||
}
|
||||
|
||||
if !hasFields {
|
||||
return nil
|
||||
}
|
||||
return fields
|
||||
}
|
||||
|
||||
// FormatAgentFields formats AgentFields as a string suitable for an issue description.
|
||||
// Only non-empty fields are included.
|
||||
func FormatAgentFields(fields *AgentFields) string {
|
||||
if fields == nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
var lines []string
|
||||
|
||||
if fields.RoleType != "" {
|
||||
lines = append(lines, "role_type: "+fields.RoleType)
|
||||
}
|
||||
if fields.Rig != "" {
|
||||
lines = append(lines, "rig: "+fields.Rig)
|
||||
}
|
||||
if fields.AgentState != "" {
|
||||
lines = append(lines, "agent_state: "+fields.AgentState)
|
||||
}
|
||||
if fields.HookBead != "" {
|
||||
lines = append(lines, "hook_bead: "+fields.HookBead)
|
||||
}
|
||||
if fields.RoleBead != "" {
|
||||
lines = append(lines, "role_bead: "+fields.RoleBead)
|
||||
}
|
||||
|
||||
return strings.Join(lines, "\n")
|
||||
return ParseAgentFields(description)
|
||||
}
|
||||
|
||||
// AttachmentFields holds the attachment info for pinned beads.
|
||||
|
||||
@@ -185,6 +185,10 @@ func (d *Daemon) heartbeat(state *State) {
|
||||
// 4. Process lifecycle requests
|
||||
d.processLifecycleRequests()
|
||||
|
||||
// 5. Check for stale agents (timeout fallback - gt-2hzl4)
|
||||
// Agents that report "running" but haven't updated in too long are marked dead
|
||||
d.checkStaleAgents()
|
||||
|
||||
// Update state
|
||||
state.LastHeartbeat = time.Now()
|
||||
state.HeartbeatCount++
|
||||
|
||||
@@ -577,6 +577,94 @@ func (d *Daemon) identityToAgentBeadID(identity string) string {
|
||||
}
|
||||
}
|
||||
|
||||
// DeadAgentTimeout is how long an agent can report "running" or "working" without updating
|
||||
// before the daemon marks it as dead. This is a fallback for crashed agents.
|
||||
const DeadAgentTimeout = 15 * time.Minute
|
||||
|
||||
// checkStaleAgents looks for agents that report state=running but haven't
|
||||
// updated their bead recently. These are likely dead agents that crashed
|
||||
// without updating their state. This is the timeout fallback per gt-2hzl4.
|
||||
func (d *Daemon) checkStaleAgents() {
|
||||
// Known agent bead IDs to check
|
||||
agentBeadIDs := []string{
|
||||
"gt-deacon",
|
||||
"gt-mayor",
|
||||
}
|
||||
|
||||
// Add rig-specific agents (witness, refinery) for known rigs
|
||||
// For now, we check gastown - could be expanded to discover rigs dynamically
|
||||
rigs := []string{"gastown", "beads"}
|
||||
for _, rig := range rigs {
|
||||
agentBeadIDs = append(agentBeadIDs, "gt-witness-"+rig)
|
||||
agentBeadIDs = append(agentBeadIDs, "gt-refinery-"+rig)
|
||||
}
|
||||
|
||||
for _, agentBeadID := range agentBeadIDs {
|
||||
info, err := d.getAgentBeadInfo(agentBeadID)
|
||||
if err != nil {
|
||||
// Agent bead doesn't exist or error fetching - skip
|
||||
continue
|
||||
}
|
||||
|
||||
// Only check agents reporting they're running/working
|
||||
if info.State != "running" && info.State != "working" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse the updated_at timestamp
|
||||
updatedAt, err := time.Parse(time.RFC3339, info.LastUpdate)
|
||||
if err != nil {
|
||||
d.logger.Printf("Warning: cannot parse updated_at for %s: %v", agentBeadID, err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if stale
|
||||
age := time.Since(updatedAt)
|
||||
if age > DeadAgentTimeout {
|
||||
d.logger.Printf("Agent %s appears dead (state=%s, last update %v ago, timeout %v)",
|
||||
agentBeadID, info.State, age.Round(time.Minute), DeadAgentTimeout)
|
||||
|
||||
// Mark as dead
|
||||
if err := d.markAgentDead(agentBeadID); err != nil {
|
||||
d.logger.Printf("Warning: failed to mark %s as dead: %v", agentBeadID, err)
|
||||
} else {
|
||||
d.logger.Printf("Marked agent %s as dead due to timeout", agentBeadID)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// markAgentDead updates an agent bead's state to "dead".
|
||||
// Uses bd update to modify the description with the new agent_state.
|
||||
func (d *Daemon) markAgentDead(agentBeadID string) error {
|
||||
// Get current agent info
|
||||
info, err := d.getAgentBeadInfo(agentBeadID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("fetching agent bead: %w", err)
|
||||
}
|
||||
|
||||
// Build new description with updated state
|
||||
newDesc := fmt.Sprintf("role_type: %s\nrig: %s\nagent_state: dead\nhook_bead: %s\nrole_bead: %s\n\nMarked dead by daemon at %s (was %s, last update too old)",
|
||||
info.RoleType,
|
||||
info.Rig,
|
||||
info.HookBead,
|
||||
info.RoleBead,
|
||||
time.Now().Format(time.RFC3339),
|
||||
info.State,
|
||||
)
|
||||
|
||||
// Use bd update to set the new description
|
||||
cmd := exec.Command("bd", "update", agentBeadID, "--description", newDesc)
|
||||
cmd.Dir = d.config.TownRoot
|
||||
|
||||
output, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
return fmt.Errorf("bd update: %w (output: %s)", err, string(output))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// identityToBDActor converts a daemon identity (with dashes) to BD_ACTOR format (with slashes).
|
||||
// Examples:
|
||||
// - "mayor" → "mayor"
|
||||
|
||||
Reference in New Issue
Block a user