Per ZFC principle: 'Let agents decide thresholds. Stuck is a judgment call.'

Changes:
- Add health check threshold fields to RoleConfig (ping_timeout, consecutive_failures, kill_cooldown, stuck_threshold)
- Add LoadStuckConfig() to read thresholds from hq-deacon-role bead
- Update patrol_check.go to use configurable stuck threshold
- Defaults remain as fallbacks when no role bead config exists

Agents can now configure their stuck detection by adding fields to their role bead, e.g.:

    ping_timeout: 45s
    consecutive_failures: 5
    kill_cooldown: 10m
    stuck_threshold: 2h

Fixes: hq-2355b
233 lines
7.4 KiB
Go
233 lines
7.4 KiB
Go
// Package deacon provides the Deacon agent infrastructure.
|
|
package deacon
|
|
|
|
import (
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"time"
|
|
|
|
"github.com/steveyegge/gastown/internal/beads"
|
|
)
|
|
|
|
// Fallback thresholds for stuck-session detection.
//
// They apply whenever the role bead supplies no value of its own.
// Per ZFC: "Let agents decide thresholds. 'Stuck' is a judgment call."
const (
	// DefaultPingTimeout is how long to wait for a response to a health check.
	DefaultPingTimeout = 30 * time.Second

	// DefaultConsecutiveFailures is the number of failed health checks in a
	// row that triggers a force-kill.
	DefaultConsecutiveFailures = 3

	// DefaultCooldown is the minimum time between force-kills.
	DefaultCooldown = 5 * time.Minute
)
|
|
|
|
// StuckConfig carries the tunable thresholds for stuck-session detection.
//
// The duration fields are plain time.Duration values, so encoding/json reads
// and writes them as integer nanoseconds.
type StuckConfig struct {
	// PingTimeout is how long to wait for a response to a health check.
	PingTimeout time.Duration `json:"ping_timeout"`

	// ConsecutiveFailures is the number of failed health checks in a row
	// that triggers a force-kill.
	ConsecutiveFailures int `json:"consecutive_failures"`

	// Cooldown is the minimum time between force-kills.
	Cooldown time.Duration `json:"cooldown"`
}
|
|
|
|
// DefaultStuckConfig returns the default stuck detection config.
|
|
func DefaultStuckConfig() *StuckConfig {
|
|
return &StuckConfig{
|
|
PingTimeout: DefaultPingTimeout,
|
|
ConsecutiveFailures: DefaultConsecutiveFailures,
|
|
Cooldown: DefaultCooldown,
|
|
}
|
|
}
|
|
|
|
// LoadStuckConfig loads stuck detection config from the Deacon's role bead.
|
|
// Returns defaults if no role bead exists or if fields aren't configured.
|
|
// Per ZFC: agents control their own thresholds via their role beads.
|
|
func LoadStuckConfig(townRoot string) *StuckConfig {
|
|
config := DefaultStuckConfig()
|
|
|
|
// Load from hq-deacon-role bead
|
|
bd := beads.NewWithBeadsDir(townRoot, beads.ResolveBeadsDir(townRoot))
|
|
roleConfig, err := bd.GetRoleConfig(beads.RoleBeadIDTown("deacon"))
|
|
if err != nil || roleConfig == nil {
|
|
return config
|
|
}
|
|
|
|
// Override defaults with role bead values
|
|
if roleConfig.PingTimeout != "" {
|
|
if d, err := time.ParseDuration(roleConfig.PingTimeout); err == nil {
|
|
config.PingTimeout = d
|
|
}
|
|
}
|
|
if roleConfig.ConsecutiveFailures > 0 {
|
|
config.ConsecutiveFailures = roleConfig.ConsecutiveFailures
|
|
}
|
|
if roleConfig.KillCooldown != "" {
|
|
if d, err := time.ParseDuration(roleConfig.KillCooldown); err == nil {
|
|
config.Cooldown = d
|
|
}
|
|
}
|
|
|
|
return config
|
|
}
|
|
|
|
// AgentHealthState is the per-agent record kept by the health checker:
// when the agent was last pinged, when it last responded, how many checks
// it has failed in a row, and its force-kill history.
//
// Note: encoding/json never treats a struct value as empty, so the omitempty
// option on the time.Time fields does not suppress zero timestamps; they are
// written out as "0001-01-01T00:00:00Z".
type AgentHealthState struct {
	// AgentID identifies the agent, e.g. "gastown/polecats/max" or "deacon".
	AgentID string `json:"agent_id"`

	// LastPingTime is the time the most recent HEALTH_CHECK nudge was sent.
	LastPingTime time.Time `json:"last_ping_time,omitempty"`

	// LastResponseTime is the time the agent last updated its activity.
	LastResponseTime time.Time `json:"last_response_time,omitempty"`

	// ConsecutiveFailures is the number of health checks failed in a row.
	ConsecutiveFailures int `json:"consecutive_failures"`

	// LastForceKillTime is the time of the most recent force-kill of this
	// agent; the zero value means none has been recorded.
	LastForceKillTime time.Time `json:"last_force_kill_time,omitempty"`

	// ForceKillCount is the total number of force-kills for this agent.
	ForceKillCount int `json:"force_kill_count"`
}
|
|
|
|
// HealthCheckState holds health check state for all monitored agents.
|
|
type HealthCheckState struct {
|
|
// Agents maps agent ID to their health state
|
|
Agents map[string]*AgentHealthState `json:"agents"`
|
|
|
|
// LastUpdated is when this state was last written
|
|
LastUpdated time.Time `json:"last_updated"`
|
|
}
|
|
|
|
// HealthCheckStateFile reports where the health check state lives on disk:
// the file "health-check-state.json" inside the "deacon" directory under
// townRoot.
func HealthCheckStateFile(townRoot string) string {
	const (
		stateDir  = "deacon"
		stateName = "health-check-state.json"
	)
	return filepath.Join(townRoot, stateDir, stateName)
}
|
|
|
|
// LoadHealthCheckState loads the health check state from disk.
|
|
// Returns empty state if file doesn't exist.
|
|
func LoadHealthCheckState(townRoot string) (*HealthCheckState, error) {
|
|
stateFile := HealthCheckStateFile(townRoot)
|
|
|
|
data, err := os.ReadFile(stateFile) //nolint:gosec // G304: path is constructed from trusted townRoot
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
// Return empty state
|
|
return &HealthCheckState{
|
|
Agents: make(map[string]*AgentHealthState),
|
|
}, nil
|
|
}
|
|
return nil, fmt.Errorf("reading health check state: %w", err)
|
|
}
|
|
|
|
var state HealthCheckState
|
|
if err := json.Unmarshal(data, &state); err != nil {
|
|
return nil, fmt.Errorf("parsing health check state: %w", err)
|
|
}
|
|
|
|
if state.Agents == nil {
|
|
state.Agents = make(map[string]*AgentHealthState)
|
|
}
|
|
|
|
return &state, nil
|
|
}
|
|
|
|
// SaveHealthCheckState saves the health check state to disk.
|
|
func SaveHealthCheckState(townRoot string, state *HealthCheckState) error {
|
|
stateFile := HealthCheckStateFile(townRoot)
|
|
|
|
// Ensure directory exists
|
|
if err := os.MkdirAll(filepath.Dir(stateFile), 0755); err != nil {
|
|
return fmt.Errorf("creating deacon directory: %w", err)
|
|
}
|
|
|
|
state.LastUpdated = time.Now().UTC()
|
|
|
|
data, err := json.MarshalIndent(state, "", " ")
|
|
if err != nil {
|
|
return fmt.Errorf("marshaling health check state: %w", err)
|
|
}
|
|
|
|
return os.WriteFile(stateFile, data, 0600)
|
|
}
|
|
|
|
// GetAgentState returns the health state for an agent, creating if needed.
|
|
func (s *HealthCheckState) GetAgentState(agentID string) *AgentHealthState {
|
|
if s.Agents == nil {
|
|
s.Agents = make(map[string]*AgentHealthState)
|
|
}
|
|
|
|
state, ok := s.Agents[agentID]
|
|
if !ok {
|
|
state = &AgentHealthState{AgentID: agentID}
|
|
s.Agents[agentID] = state
|
|
}
|
|
return state
|
|
}
|
|
|
|
// HealthCheckResult describes the outcome of one health check of an agent.
type HealthCheckResult struct {
	// AgentID identifies the agent that was checked.
	AgentID string `json:"agent_id"`

	// Responded reports whether the agent responded to the check.
	Responded bool `json:"responded"`

	// ResponseTime is the response time of the check; a zero value is
	// omitted from the JSON output.
	ResponseTime time.Duration `json:"response_time,omitempty"`

	// ConsecutiveFailures is the agent's count of failed checks in a row.
	ConsecutiveFailures int `json:"consecutive_failures"`

	// ShouldForceKill reports whether the agent should be force-killed.
	ShouldForceKill bool `json:"should_force_kill"`

	// InCooldown reports whether the agent is in the cooldown period that
	// follows a force-kill.
	InCooldown bool `json:"in_cooldown"`

	// CooldownRemaining is the time left in the cooldown period; a zero
	// value is omitted from the JSON output.
	CooldownRemaining time.Duration `json:"cooldown_remaining,omitempty"`
}
|
|
|
|
// Sentinel errors for stuck-session detection. Compare against them with
// errors.Is.
var (
	// ErrAgentInCooldown means the agent was force-killed recently and its
	// cooldown period has not yet passed.
	ErrAgentInCooldown = errors.New("agent is in cooldown period after recent force-kill")

	// ErrAgentNotFound means the agent, or its session, does not exist.
	ErrAgentNotFound = errors.New("agent not found or session doesn't exist")

	// ErrAgentResponsive means the agent is responding and nothing needs to
	// be done.
	ErrAgentResponsive = errors.New("agent is responsive, no action needed")
)
|
|
|
|
// RecordPing records that a health check ping was sent to an agent.
|
|
func (s *AgentHealthState) RecordPing() {
|
|
s.LastPingTime = time.Now().UTC()
|
|
}
|
|
|
|
// RecordResponse records that an agent responded to a health check.
|
|
// This resets the consecutive failure counter.
|
|
func (s *AgentHealthState) RecordResponse() {
|
|
s.LastResponseTime = time.Now().UTC()
|
|
s.ConsecutiveFailures = 0
|
|
}
|
|
|
|
// RecordFailure records that an agent failed to respond to a health check.
|
|
func (s *AgentHealthState) RecordFailure() {
|
|
s.ConsecutiveFailures++
|
|
}
|
|
|
|
// RecordForceKill records that an agent was force-killed.
|
|
func (s *AgentHealthState) RecordForceKill() {
|
|
s.LastForceKillTime = time.Now().UTC()
|
|
s.ForceKillCount++
|
|
s.ConsecutiveFailures = 0 // Reset after kill
|
|
}
|
|
|
|
// IsInCooldown returns true if the agent was recently force-killed.
|
|
func (s *AgentHealthState) IsInCooldown(cooldown time.Duration) bool {
|
|
if s.LastForceKillTime.IsZero() {
|
|
return false
|
|
}
|
|
return time.Since(s.LastForceKillTime) < cooldown
|
|
}
|
|
|
|
// CooldownRemaining returns how long until cooldown expires.
|
|
func (s *AgentHealthState) CooldownRemaining(cooldown time.Duration) time.Duration {
|
|
if s.LastForceKillTime.IsZero() {
|
|
return 0
|
|
}
|
|
remaining := cooldown - time.Since(s.LastForceKillTime)
|
|
if remaining < 0 {
|
|
return 0
|
|
}
|
|
return remaining
|
|
}
|
|
|
|
// ShouldForceKill returns true if the agent has exceeded the failure threshold.
|
|
func (s *AgentHealthState) ShouldForceKill(threshold int) bool {
|
|
return s.ConsecutiveFailures >= threshold
|
|
}
|