fix(zfc): move stuck detection thresholds to agent-controlled config
Per ZFC principle: 'Let agents decide thresholds. Stuck is a judgment call.'

Changes:
- Add health check threshold fields to RoleConfig (ping_timeout, consecutive_failures, kill_cooldown, stuck_threshold)
- Add LoadStuckConfig() to read thresholds from the hq-deacon-role bead
- Update patrol_check.go to use the configurable stuck threshold
- Defaults remain as fallbacks when no role bead config exists

Agents can now configure their stuck detection by adding fields to their role bead, e.g.:

    ping_timeout: 45s
    consecutive_failures: 5
    kill_cooldown: 10m
    stuck_threshold: 2h

Fixes: hq-2355b
This commit is contained in:
committed by
Steve Yegge
parent
0f633be4b1
commit
e0858096f6
@@ -528,6 +528,25 @@ type RoleConfig struct {
|
|||||||
// EnvVars are additional environment variables to set in the session.
|
// EnvVars are additional environment variables to set in the session.
|
||||||
// Stored as "key=value" pairs.
|
// Stored as "key=value" pairs.
|
||||||
EnvVars map[string]string
|
EnvVars map[string]string
|
||||||
|
|
||||||
|
// Health check thresholds - per ZFC, agents control their own stuck detection.
|
||||||
|
// These allow the Deacon's patrol config to be agent-defined rather than hardcoded.
|
||||||
|
|
||||||
|
// PingTimeout is how long to wait for a health check response.
|
||||||
|
// Format: duration string (e.g., "30s", "1m"). Default: 30s.
|
||||||
|
PingTimeout string
|
||||||
|
|
||||||
|
// ConsecutiveFailures is how many failed health checks before force-kill.
|
||||||
|
// Default: 3.
|
||||||
|
ConsecutiveFailures int
|
||||||
|
|
||||||
|
// KillCooldown is the minimum time between force-kills of the same agent.
|
||||||
|
// Format: duration string (e.g., "5m", "10m"). Default: 5m.
|
||||||
|
KillCooldown string
|
||||||
|
|
||||||
|
// StuckThreshold is how long a wisp can be in_progress before considered stuck.
|
||||||
|
// Format: duration string (e.g., "1h", "30m"). Default: 1h.
|
||||||
|
StuckThreshold string
|
||||||
}
|
}
|
||||||
|
|
||||||
// ParseRoleConfig extracts RoleConfig from a role bead's description.
|
// ParseRoleConfig extracts RoleConfig from a role bead's description.
|
||||||
@@ -576,6 +595,21 @@ func ParseRoleConfig(description string) *RoleConfig {
|
|||||||
config.EnvVars[envKey] = envVal
|
config.EnvVars[envKey] = envVal
|
||||||
hasFields = true
|
hasFields = true
|
||||||
}
|
}
|
||||||
|
// Health check threshold fields (ZFC: agent-controlled)
|
||||||
|
case "ping_timeout", "ping-timeout", "pingtimeout":
|
||||||
|
config.PingTimeout = value
|
||||||
|
hasFields = true
|
||||||
|
case "consecutive_failures", "consecutive-failures", "consecutivefailures":
|
||||||
|
if n, err := parseIntValue(value); err == nil {
|
||||||
|
config.ConsecutiveFailures = n
|
||||||
|
hasFields = true
|
||||||
|
}
|
||||||
|
case "kill_cooldown", "kill-cooldown", "killcooldown":
|
||||||
|
config.KillCooldown = value
|
||||||
|
hasFields = true
|
||||||
|
case "stuck_threshold", "stuck-threshold", "stuckthreshold":
|
||||||
|
config.StuckThreshold = value
|
||||||
|
hasFields = true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -585,6 +619,13 @@ func ParseRoleConfig(description string) *RoleConfig {
|
|||||||
return config
|
return config
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseIntValue parses a base-10 integer from a string value.
// Leading and trailing spaces are tolerated, but any other trailing text
// (e.g. "5m" or "3x") is reported as an error rather than being silently
// truncated to its numeric prefix. On error the returned int is 0.
func parseIntValue(s string) (int, error) {
	var (
		n        int
		trailing string
	)
	// The extra %s verb exists only to detect leftover text after the number:
	// a clean integer scans exactly one item (the %s then hits end of input),
	// while "5m" scans two.
	count, err := fmt.Sscanf(s, "%d%s", &n, &trailing)
	if count == 0 {
		return 0, fmt.Errorf("invalid integer %q: %w", s, err)
	}
	if count > 1 {
		return 0, fmt.Errorf("invalid integer %q: unexpected trailing text %q", s, trailing)
	}
	return n, nil
}
|
||||||
|
|
||||||
// FormatRoleConfig formats RoleConfig as a string suitable for a role bead description.
|
// FormatRoleConfig formats RoleConfig as a string suitable for a role bead description.
|
||||||
// Only non-empty/non-default fields are included.
|
// Only non-empty/non-default fields are included.
|
||||||
func FormatRoleConfig(config *RoleConfig) string {
|
func FormatRoleConfig(config *RoleConfig) string {
|
||||||
|
|||||||
@@ -8,13 +8,17 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/steveyegge/gastown/internal/beads"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Default parameters for stuck-session detection.
|
// Default parameters for stuck-session detection.
|
||||||
|
// These are fallbacks when no role bead config exists.
|
||||||
|
// Per ZFC: "Let agents decide thresholds. 'Stuck' is a judgment call."
|
||||||
const (
|
const (
|
||||||
DefaultPingTimeout = 30 * time.Second // How long to wait for response
|
DefaultPingTimeout = 30 * time.Second // How long to wait for response
|
||||||
DefaultConsecutiveFailures = 3 // Failures before force-kill
|
DefaultConsecutiveFailures = 3 // Failures before force-kill
|
||||||
DefaultCooldown = 5 * time.Minute // Minimum time between force-kills
|
DefaultCooldown = 5 * time.Minute // Minimum time between force-kills
|
||||||
)
|
)
|
||||||
|
|
||||||
// StuckConfig holds configurable parameters for stuck-session detection.
|
// StuckConfig holds configurable parameters for stuck-session detection.
|
||||||
@@ -33,6 +37,37 @@ func DefaultStuckConfig() *StuckConfig {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LoadStuckConfig loads stuck detection config from the Deacon's role bead.
|
||||||
|
// Returns defaults if no role bead exists or if fields aren't configured.
|
||||||
|
// Per ZFC: agents control their own thresholds via their role beads.
|
||||||
|
func LoadStuckConfig(townRoot string) *StuckConfig {
|
||||||
|
config := DefaultStuckConfig()
|
||||||
|
|
||||||
|
// Load from hq-deacon-role bead
|
||||||
|
bd := beads.NewWithBeadsDir(townRoot, beads.ResolveBeadsDir(townRoot))
|
||||||
|
roleConfig, err := bd.GetRoleConfig(beads.RoleBeadIDTown("deacon"))
|
||||||
|
if err != nil || roleConfig == nil {
|
||||||
|
return config
|
||||||
|
}
|
||||||
|
|
||||||
|
// Override defaults with role bead values
|
||||||
|
if roleConfig.PingTimeout != "" {
|
||||||
|
if d, err := time.ParseDuration(roleConfig.PingTimeout); err == nil {
|
||||||
|
config.PingTimeout = d
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if roleConfig.ConsecutiveFailures > 0 {
|
||||||
|
config.ConsecutiveFailures = roleConfig.ConsecutiveFailures
|
||||||
|
}
|
||||||
|
if roleConfig.KillCooldown != "" {
|
||||||
|
if d, err := time.ParseDuration(roleConfig.KillCooldown); err == nil {
|
||||||
|
config.Cooldown = d
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return config
|
||||||
|
}
|
||||||
|
|
||||||
// AgentHealthState tracks the health check state for a single agent.
|
// AgentHealthState tracks the health check state for a single agent.
|
||||||
type AgentHealthState struct {
|
type AgentHealthState struct {
|
||||||
// AgentID is the identifier (e.g., "gastown/polecats/max" or "deacon")
|
// AgentID is the identifier (e.g., "gastown/polecats/max" or "deacon")
|
||||||
|
|||||||
@@ -219,6 +219,10 @@ type PatrolNotStuckCheck struct {
|
|||||||
stuckThreshold time.Duration
|
stuckThreshold time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// DefaultStuckThreshold is the stuck threshold used when the role bead
// does not provide one.
// Per ZFC: "Let agents decide thresholds. 'Stuck' is a judgment call."
const DefaultStuckThreshold = time.Hour
|
||||||
|
|
||||||
// NewPatrolNotStuckCheck creates a new patrol not stuck check.
|
// NewPatrolNotStuckCheck creates a new patrol not stuck check.
|
||||||
func NewPatrolNotStuckCheck() *PatrolNotStuckCheck {
|
func NewPatrolNotStuckCheck() *PatrolNotStuckCheck {
|
||||||
return &PatrolNotStuckCheck{
|
return &PatrolNotStuckCheck{
|
||||||
@@ -226,12 +230,29 @@ func NewPatrolNotStuckCheck() *PatrolNotStuckCheck {
|
|||||||
CheckName: "patrol-not-stuck",
|
CheckName: "patrol-not-stuck",
|
||||||
CheckDescription: "Check for stuck patrol wisps (>1h in_progress)",
|
CheckDescription: "Check for stuck patrol wisps (>1h in_progress)",
|
||||||
},
|
},
|
||||||
stuckThreshold: 1 * time.Hour,
|
stuckThreshold: DefaultStuckThreshold,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// loadStuckThreshold loads the stuck threshold from the Deacon's role bead.
|
||||||
|
// Returns the default if no config exists.
|
||||||
|
func loadStuckThreshold(townRoot string) time.Duration {
|
||||||
|
bd := beads.NewWithBeadsDir(townRoot, beads.ResolveBeadsDir(townRoot))
|
||||||
|
roleConfig, err := bd.GetRoleConfig(beads.RoleBeadIDTown("deacon"))
|
||||||
|
if err != nil || roleConfig == nil || roleConfig.StuckThreshold == "" {
|
||||||
|
return DefaultStuckThreshold
|
||||||
|
}
|
||||||
|
if d, err := time.ParseDuration(roleConfig.StuckThreshold); err == nil {
|
||||||
|
return d
|
||||||
|
}
|
||||||
|
return DefaultStuckThreshold
|
||||||
|
}
|
||||||
|
|
||||||
// Run checks for stuck patrol wisps.
|
// Run checks for stuck patrol wisps.
|
||||||
func (c *PatrolNotStuckCheck) Run(ctx *CheckContext) *CheckResult {
|
func (c *PatrolNotStuckCheck) Run(ctx *CheckContext) *CheckResult {
|
||||||
|
// Load threshold from role bead (ZFC: agent-controlled)
|
||||||
|
c.stuckThreshold = loadStuckThreshold(ctx.TownRoot)
|
||||||
|
|
||||||
rigs, err := discoverRigs(ctx.TownRoot)
|
rigs, err := discoverRigs(ctx.TownRoot)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return &CheckResult{
|
return &CheckResult{
|
||||||
@@ -261,11 +282,12 @@ func (c *PatrolNotStuckCheck) Run(ctx *CheckContext) *CheckResult {
|
|||||||
stuckWisps = append(stuckWisps, stuck...)
|
stuckWisps = append(stuckWisps, stuck...)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
thresholdStr := c.stuckThreshold.String()
|
||||||
if len(stuckWisps) > 0 {
|
if len(stuckWisps) > 0 {
|
||||||
return &CheckResult{
|
return &CheckResult{
|
||||||
Name: c.Name(),
|
Name: c.Name(),
|
||||||
Status: StatusWarning,
|
Status: StatusWarning,
|
||||||
Message: fmt.Sprintf("%d stuck patrol wisp(s) found (>1h)", len(stuckWisps)),
|
Message: fmt.Sprintf("%d stuck patrol wisp(s) found (>%s)", len(stuckWisps), thresholdStr),
|
||||||
Details: stuckWisps,
|
Details: stuckWisps,
|
||||||
FixHint: "Manual review required - wisps may need to be burned or sessions restarted",
|
FixHint: "Manual review required - wisps may need to be burned or sessions restarted",
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user