Add Deacon stuck-session detection and force-kill protocol

Implements gt-l6ro3.4: Deacon can now detect and kill genuinely stuck/hung
Claude Code sessions during health rounds.

New commands:
- gt deacon health-check <agent>: Pings agent, waits for response, tracks
  consecutive failures. Returns exit code 2 when force-kill threshold reached.
- gt deacon force-kill <agent>: Kills tmux session, updates agent bead state,
  notifies mayor. Respects cooldown period.
- gt deacon health-state: Shows health check state for all monitored agents.

Detection protocol:
1. Send HEALTH_CHECK nudge to agent
2. Wait for agent bead update (configurable timeout, default 30s)
3. Track consecutive failures (default threshold: 3)
4. Recommend force-kill when threshold is reached

Force-kill protocol:
1. Log intervention (mail to agent)
2. Kill tmux session
3. Update agent bead state to "killed"
4. Notify mayor (optional)

Configurable parameters:
- --timeout: How long to wait for response (default 30s)
- --failures: Consecutive failures before force-kill (default 3)
- --cooldown: Minimum time between force-kills (default 5m)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
gastown/polecats/slit
2025-12-30 22:11:07 -08:00
committed by Steve Yegge
parent 108afdbc52
commit 3dd18a1981
3 changed files with 922 additions and 1 deletions

View File

@@ -1,9 +1,11 @@
package cmd
import (
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"sort"
	"strings"
	"time"
@@ -114,8 +116,84 @@ This command is typically called by the daemon during cold startup.`,
RunE: runDeaconTriggerPending,
}
// deaconHealthCheckCmd defines the "health-check <agent>" subcommand. It requires
// exactly one positional argument (the agent address) and runs runDeaconHealthCheck.
// NOTE(review): the `var triggerTimeout time.Duration` line inside Long looks like a
// leftover of the old declaration from diff rendering — confirm against the real file.
var deaconHealthCheckCmd = &cobra.Command{
Use: "health-check <agent>",
Short: "Send a health check ping to an agent and track response",
Long: `Send a HEALTH_CHECK nudge to an agent and wait for response.
var triggerTimeout time.Duration
This command is used by the Deacon during health rounds to detect stuck sessions.
It tracks consecutive failures and determines when force-kill is warranted.
The detection protocol:
1. Send HEALTH_CHECK nudge to the agent
2. Wait for agent to update their bead (configurable timeout, default 30s)
3. If no activity update, increment failure counter
4. After N consecutive failures (default 3), recommend force-kill
Exit codes:
0 - Agent responded or is in cooldown (no action needed)
1 - Error occurred
2 - Agent should be force-killed (consecutive failures exceeded)
Examples:
gt deacon health-check gastown/polecats/max
gt deacon health-check gastown/witness --timeout=60s
gt deacon health-check deacon --failures=5`,
Args: cobra.ExactArgs(1),
RunE: runDeaconHealthCheck,
}
// deaconForceKillCmd defines the "force-kill <agent>" subcommand. It requires
// exactly one positional argument (the agent address) and runs runDeaconForceKill.
var deaconForceKillCmd = &cobra.Command{
Use: "force-kill <agent>",
Short: "Force-kill an unresponsive agent session",
Long: `Force-kill an agent session that has been detected as stuck.
This command is used by the Deacon when an agent fails consecutive health checks.
It performs the force-kill protocol:
1. Log the intervention (send mail to agent)
2. Kill the tmux session
3. Update agent bead state to "killed"
4. Notify mayor (optional, for visibility)
After force-kill, the agent is 'asleep'. Normal wake mechanisms apply:
- gt rig boot restarts it
- Or stays asleep until next activity trigger
This respects the cooldown period - won't kill if recently killed.
Examples:
gt deacon force-kill gastown/polecats/max
gt deacon force-kill gastown/witness --reason="unresponsive for 90s"`,
Args: cobra.ExactArgs(1),
RunE: runDeaconForceKill,
}
// deaconHealthStateCmd defines the "health-state" subcommand, which prints the
// persisted health check state via runDeaconHealthState. No Args validator is set.
var deaconHealthStateCmd = &cobra.Command{
Use: "health-state",
Short: "Show health check state for all monitored agents",
Long: `Display the current health check state including:
- Consecutive failure counts
- Last ping and response times
- Force-kill history and cooldowns
This helps the Deacon understand which agents may need attention.`,
RunE: runDeaconHealthState,
}
// Package-level storage for deacon subcommand flag values.
var (
// triggerTimeout backs the trigger-pending command's --timeout flag.
triggerTimeout time.Duration
// Health check flags
// healthCheckTimeout is how long health-check waits for the agent to respond.
healthCheckTimeout time.Duration
// healthCheckFailures is the consecutive-failure count at which force-kill is recommended.
healthCheckFailures int
// healthCheckCooldown is the minimum time between force-kills of one agent.
// It is also read by the force-kill and health-state handlers.
// NOTE(review): only health-check appears to register a --cooldown flag, so the
// other commands presumably always see the default — confirm.
healthCheckCooldown time.Duration
// Force kill flags
// forceKillReason is the optional --reason text included in notifications.
forceKillReason string
// forceKillSkipNotify disables the notification mail to the mayor.
forceKillSkipNotify bool
)
func init() {
deaconCmd.AddCommand(deaconStartCmd)
@@ -125,11 +203,28 @@ func init() {
deaconCmd.AddCommand(deaconRestartCmd)
deaconCmd.AddCommand(deaconHeartbeatCmd)
deaconCmd.AddCommand(deaconTriggerPendingCmd)
deaconCmd.AddCommand(deaconHealthCheckCmd)
deaconCmd.AddCommand(deaconForceKillCmd)
deaconCmd.AddCommand(deaconHealthStateCmd)
// Flags for trigger-pending
deaconTriggerPendingCmd.Flags().DurationVar(&triggerTimeout, "timeout", 2*time.Second,
"Timeout for checking if Claude is ready")
// Flags for health-check
deaconHealthCheckCmd.Flags().DurationVar(&healthCheckTimeout, "timeout", 30*time.Second,
"How long to wait for agent response")
deaconHealthCheckCmd.Flags().IntVar(&healthCheckFailures, "failures", 3,
"Number of consecutive failures before recommending force-kill")
deaconHealthCheckCmd.Flags().DurationVar(&healthCheckCooldown, "cooldown", 5*time.Minute,
"Minimum time between force-kills of same agent")
// Flags for force-kill
deaconForceKillCmd.Flags().StringVar(&forceKillReason, "reason", "",
"Reason for force-kill (included in notifications)")
deaconForceKillCmd.Flags().BoolVar(&forceKillSkipNotify, "skip-notify", false,
"Skip sending notification mail to mayor")
rootCmd.AddCommand(deaconCmd)
}
@@ -465,3 +560,326 @@ func ensurePatrolHooks(workspacePath string) error {
return os.WriteFile(settingsPath, []byte(hooksJSON), 0600)
}
// runDeaconHealthCheck implements the health-check command.
//
// It sends a HEALTH_CHECK nudge to the agent's tmux session, then polls the
// agent bead's update time until it moves past the pre-ping baseline or
// healthCheckTimeout elapses, and records the outcome in the persisted health
// check state.
//
// args[0] is the agent address (see agentAddressToIDs). The function returns
// nil when the agent is in cooldown, has no running session, responded, or
// failed without reaching the threshold. When the consecutive failure count
// reaches healthCheckFailures the process terminates with exit code 2 so the
// caller can tell "should force-kill" apart from an ordinary error.
func runDeaconHealthCheck(cmd *cobra.Command, args []string) error {
	// pollInterval is how often the agent bead is re-read while waiting.
	const pollInterval = 2 * time.Second

	agent := args[0]

	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}

	// Load health check state
	state, err := deacon.LoadHealthCheckState(townRoot)
	if err != nil {
		return fmt.Errorf("loading health check state: %w", err)
	}
	agentState := state.GetAgentState(agent)

	// A recently force-killed agent is left alone until the cooldown expires.
	if agentState.IsInCooldown(healthCheckCooldown) {
		remaining := agentState.CooldownRemaining(healthCheckCooldown)
		fmt.Printf("%s Agent %s is in cooldown (remaining: %s)\n",
			style.Dim.Render("○"), agent, remaining.Round(time.Second))
		return nil
	}

	beadID, sessionName, err := agentAddressToIDs(agent)
	if err != nil {
		return fmt.Errorf("invalid agent address: %w", err)
	}

	t := tmux.NewTmux()

	// Nothing to ping if the session is not running.
	exists, err := t.HasSession(sessionName)
	if err != nil {
		return fmt.Errorf("checking session: %w", err)
	}
	if !exists {
		fmt.Printf("%s Agent %s session not running\n", style.Dim.Render("○"), agent)
		return nil
	}

	// Baseline: the bead's update time before the ping. If it cannot be read
	// (the bead might not exist yet) the zero time is used, so any later
	// successful read counts as a response.
	baselineTime, err := getAgentBeadUpdateTime(townRoot, beadID)
	if err != nil {
		baselineTime = time.Time{}
	}

	// Record ping
	agentState.RecordPing()

	// Send health check nudge
	if err := t.NudgeSession(sessionName, "HEALTH_CHECK: respond with any action to confirm responsiveness"); err != nil {
		return fmt.Errorf("sending nudge: %w", err)
	}
	fmt.Printf("%s Sent HEALTH_CHECK to %s, waiting %s...\n",
		style.Bold.Render("→"), agent, healthCheckTimeout)

	// Wait for response. Each sleep is capped at the time left so the overall
	// wait never exceeds the configured timeout, even when the timeout is
	// shorter than (or not a multiple of) the poll interval.
	deadline := time.Now().Add(healthCheckTimeout)
	responded := false
	for {
		remaining := time.Until(deadline)
		if remaining <= 0 {
			break
		}
		wait := pollInterval
		if remaining < wait {
			wait = remaining
		}
		time.Sleep(wait)

		newTime, err := getAgentBeadUpdateTime(townRoot, beadID)
		if err != nil {
			// Transient read failure: keep polling until the deadline.
			continue
		}
		// If bead was updated after our baseline, agent responded
		if newTime.After(baselineTime) {
			responded = true
			break
		}
	}

	// Record result
	if responded {
		agentState.RecordResponse()
		if err := deacon.SaveHealthCheckState(townRoot, state); err != nil {
			style.PrintWarning("failed to save health check state: %v", err)
		}
		fmt.Printf("%s Agent %s responded (failures reset to 0)\n",
			style.Bold.Render("✓"), agent)
		return nil
	}

	// No response - record failure
	agentState.RecordFailure()
	if err := deacon.SaveHealthCheckState(townRoot, state); err != nil {
		style.PrintWarning("failed to save health check state: %v", err)
	}
	fmt.Printf("%s Agent %s did not respond (consecutive failures: %d/%d)\n",
		style.Dim.Render("⚠"), agent, agentState.ConsecutiveFailures, healthCheckFailures)

	// Check if force-kill threshold reached
	if agentState.ShouldForceKill(healthCheckFailures) {
		fmt.Printf("%s Agent %s should be force-killed\n", style.Bold.Render("✗"), agent)
		os.Exit(2) // Exit code 2 = should force-kill
	}
	return nil
}
// runDeaconForceKill implements the force-kill command.
//
// For the agent named by args[0] it refuses to act while the agent is in its
// force-kill cooldown, does nothing if the tmux session is not running, and
// otherwise mails the agent, kills the session, marks the agent bead as
// "killed", optionally mails the mayor, and records the kill in the persisted
// health check state. Only a failure to kill the session (or an earlier setup
// step) is returned as an error; mail and bead updates are best effort.
func runDeaconForceKill(cmd *cobra.Command, args []string) error {
	target := args[0]

	root, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}

	health, err := deacon.LoadHealthCheckState(root)
	if err != nil {
		return fmt.Errorf("loading health check state: %w", err)
	}
	entry := health.GetAgentState(target)

	// Cooldown guard: never kill the same agent twice in quick succession.
	if entry.IsInCooldown(healthCheckCooldown) {
		left := entry.CooldownRemaining(healthCheckCooldown).Round(time.Second)
		return fmt.Errorf("agent %s is in cooldown (remaining: %s) - cannot force-kill yet",
			target, left)
	}

	// Only the session name is needed here; the bead ID is resolved later.
	_, session, err := agentAddressToIDs(target)
	if err != nil {
		return fmt.Errorf("invalid agent address: %w", err)
	}

	mux := tmux.NewTmux()
	running, err := mux.HasSession(session)
	if err != nil {
		return fmt.Errorf("checking session: %w", err)
	}
	if !running {
		fmt.Printf("%s Agent %s session not running\n", style.Dim.Render("○"), target)
		return nil
	}

	// Fall back to a generated reason when --reason was not supplied.
	why := forceKillReason
	if why == "" {
		why = fmt.Sprintf("unresponsive after %d consecutive health check failures",
			entry.ConsecutiveFailures)
	}

	// Step 1: leave a record of the intervention in the agent's mailbox.
	fmt.Printf("%s Sending force-kill notification to %s...\n", style.Dim.Render("1."), target)
	sendMail(root, target, "FORCE_KILL: unresponsive",
		fmt.Sprintf("Deacon detected %s as unresponsive.\nReason: %s\nAction: force-killing session", target, why))

	// Step 2: kill the tmux session; this is the only step that can abort.
	fmt.Printf("%s Killing tmux session %s...\n", style.Dim.Render("2."), session)
	if killErr := mux.KillSession(session); killErr != nil {
		return fmt.Errorf("killing session: %w", killErr)
	}

	// Step 3: mark the bead as killed (best effort).
	fmt.Printf("%s Updating agent bead state to 'killed'...\n", style.Dim.Render("3."))
	updateAgentBeadState(root, target, "killed", why)

	// Step 4: tell the mayor unless notification was disabled.
	if notify := !forceKillSkipNotify; notify {
		fmt.Printf("%s Notifying mayor...\n", style.Dim.Render("4."))
		sendMail(root, "mayor/", "Agent killed: "+target,
			fmt.Sprintf("Agent %s was force-killed by Deacon.\nReason: %s", target, why))
	}

	// Persist the kill so the cooldown applies to later invocations.
	entry.RecordForceKill()
	if saveErr := deacon.SaveHealthCheckState(root, health); saveErr != nil {
		style.PrintWarning("failed to save health check state: %v", saveErr)
	}

	fmt.Printf("%s Force-killed agent %s (total kills: %d)\n",
		style.Bold.Render("✓"), target, entry.ForceKillCount)
	fmt.Printf(" %s\n", style.Dim.Render("Agent is now 'asleep'. Use 'gt rig boot' to restart."))
	return nil
}
// runDeaconHealthState implements the health-state command.
//
// It loads the persisted health check state and prints, for every recorded
// agent, the time since the last ping/response/force-kill, the consecutive
// failure count, the total force-kill count and any remaining cooldown.
// Agents are printed in sorted order so the output is stable between runs.
func runDeaconHealthState(cmd *cobra.Command, args []string) error {
	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}

	state, err := deacon.LoadHealthCheckState(townRoot)
	if err != nil {
		return fmt.Errorf("loading health check state: %w", err)
	}
	if len(state.Agents) == 0 {
		fmt.Printf("%s No health check state recorded yet\n", style.Dim.Render("○"))
		return nil
	}

	fmt.Printf("%s Health Check State (updated %s)\n\n",
		style.Bold.Render("●"),
		state.LastUpdated.Format(time.RFC3339))

	// Map iteration order is randomized in Go; sort the IDs for stable output.
	agentIDs := make([]string, 0, len(state.Agents))
	for agentID := range state.Agents {
		agentIDs = append(agentIDs, agentID)
	}
	sort.Strings(agentIDs)

	for _, agentID := range agentIDs {
		agentState := state.Agents[agentID]
		if agentState == nil {
			// A null entry in the state file carries no data to show.
			continue
		}
		fmt.Printf("Agent: %s\n", style.Bold.Render(agentID))
		if !agentState.LastPingTime.IsZero() {
			fmt.Printf(" Last ping: %s ago\n", time.Since(agentState.LastPingTime).Round(time.Second))
		}
		if !agentState.LastResponseTime.IsZero() {
			fmt.Printf(" Last response: %s ago\n", time.Since(agentState.LastResponseTime).Round(time.Second))
		}
		fmt.Printf(" Consecutive failures: %d\n", agentState.ConsecutiveFailures)
		fmt.Printf(" Total force-kills: %d\n", agentState.ForceKillCount)
		if !agentState.LastForceKillTime.IsZero() {
			fmt.Printf(" Last force-kill: %s ago\n", time.Since(agentState.LastForceKillTime).Round(time.Second))
			if agentState.IsInCooldown(healthCheckCooldown) {
				remaining := agentState.CooldownRemaining(healthCheckCooldown)
				fmt.Printf(" Cooldown: %s remaining\n", remaining.Round(time.Second))
			}
		}
		fmt.Println()
	}
	return nil
}
// agentAddressToIDs converts an agent address to its bead ID and tmux session name.
//
// Supported address forms:
//   - "deacon" and "mayor" (town-level singletons)
//   - "<rig>/witness" and "<rig>/refinery"
//   - "<rig>/polecats/<name>" and "<rig>/crew/<name>"
//
// An error is returned for any other shape, for an unknown role or agent
// type, and for addresses containing an empty path segment (for example
// "gastown/polecats/" or "/witness"), which would otherwise produce
// malformed IDs such as "gt-gastown-polecat-".
func agentAddressToIDs(address string) (beadID, sessionName string, err error) {
	switch address {
	case "deacon":
		return "gt-deacon", DeaconSessionName, nil
	case "mayor":
		return "gt-mayor", "gt-mayor", nil
	}

	parts := strings.Split(address, "/")
	for _, part := range parts {
		if part == "" {
			return "", "", fmt.Errorf("invalid agent address %q: empty path segment", address)
		}
	}

	switch len(parts) {
	case 2:
		// rig/role: "gastown/witness", "gastown/refinery"
		rig, role := parts[0], parts[1]
		switch role {
		case "witness":
			return fmt.Sprintf("gt-%s-witness", rig), fmt.Sprintf("gt-%s-witness", rig), nil
		case "refinery":
			return fmt.Sprintf("gt-%s-refinery", rig), fmt.Sprintf("gt-%s-refinery", rig), nil
		default:
			return "", "", fmt.Errorf("unknown role: %s", role)
		}
	case 3:
		// rig/type/name: "gastown/polecats/max", "gastown/crew/alpha"
		rig, agentType, name := parts[0], parts[1], parts[2]
		switch agentType {
		case "polecats":
			// Polecat sessions omit the "polecat" infix that the bead ID carries.
			return fmt.Sprintf("gt-%s-polecat-%s", rig, name), fmt.Sprintf("gt-%s-%s", rig, name), nil
		case "crew":
			return fmt.Sprintf("gt-%s-crew-%s", rig, name), fmt.Sprintf("gt-%s-crew-%s", rig, name), nil
		default:
			return "", "", fmt.Errorf("unknown agent type: %s", agentType)
		}
	default:
		return "", "", fmt.Errorf("invalid agent address format: %s (expected rig/type/name or rig/role)", address)
	}
}
// getAgentBeadUpdateTime returns the updated_at timestamp of an agent bead.
//
// It runs `bd show <beadID> --json` in townRoot, decodes the output as a JSON
// array of issues and parses the first element's updated_at as RFC 3339.
// On any failure the zero time is returned together with an error that says
// which step failed and for which bead.
func getAgentBeadUpdateTime(townRoot, beadID string) (time.Time, error) {
	cmd := exec.Command("bd", "show", beadID, "--json")
	cmd.Dir = townRoot

	output, err := cmd.Output()
	if err != nil {
		return time.Time{}, fmt.Errorf("running bd show %s: %w", beadID, err)
	}

	var issues []struct {
		UpdatedAt string `json:"updated_at"`
	}
	if err := json.Unmarshal(output, &issues); err != nil {
		return time.Time{}, fmt.Errorf("parsing bd show output for %s: %w", beadID, err)
	}
	if len(issues) == 0 {
		return time.Time{}, fmt.Errorf("bead not found: %s", beadID)
	}

	updatedAt, err := time.Parse(time.RFC3339, issues[0].UpdatedAt)
	if err != nil {
		return time.Time{}, fmt.Errorf("parsing updated_at for %s: %w", beadID, err)
	}
	return updatedAt, nil
}
// sendMail delivers a message by running `gt mail send <to> -s <subject> -m <body>`
// with townRoot as the working directory. Delivery is best effort: the
// command's result is deliberately discarded and nothing is reported to the caller.
func sendMail(townRoot, to, subject, body string) {
	argv := []string{"mail", "send", to, "-s", subject, "-m", body}
	mailCmd := exec.Command("gt", argv...)
	mailCmd.Dir = townRoot
	// Notification failures must not interrupt the caller's workflow.
	_ = mailCmd.Run()
}
// updateAgentBeadState sets an agent bead's state by running
// `bd agent state <beadID> <state>` in townRoot. The update is best effort:
// an unparseable agent address or a failing command is silently ignored.
// The reason parameter is accepted but is not passed to the command.
func updateAgentBeadState(townRoot, agent, state, reason string) {
	beadID, _, err := agentAddressToIDs(agent)
	if err == nil {
		bd := exec.Command("bd", "agent", "state", beadID, state)
		bd.Dir = townRoot
		// Failure to update the bead must not abort the caller.
		_ = bd.Run()
	}
}

197
internal/deacon/stuck.go Normal file
View File

@@ -0,0 +1,197 @@
// Package deacon provides the Deacon agent infrastructure.
package deacon
import (
"encoding/json"
"errors"
"fmt"
"os"
"path/filepath"
"time"
)
// Built-in defaults for stuck-session detection.
const (
	// DefaultPingTimeout is how long to wait for a response to a ping.
	DefaultPingTimeout = 30 * time.Second
	// DefaultConsecutiveFailures is the number of failures before force-kill.
	DefaultConsecutiveFailures = 3
	// DefaultCooldown is the minimum time between force-kills.
	DefaultCooldown = 5 * time.Minute
)

// StuckConfig groups the tunable parameters of stuck-session detection:
// the ping timeout, the consecutive-failure threshold and the force-kill cooldown.
type StuckConfig struct {
	PingTimeout         time.Duration `json:"ping_timeout"`
	ConsecutiveFailures int           `json:"consecutive_failures"`
	Cooldown            time.Duration `json:"cooldown"`
}

// DefaultStuckConfig returns a newly allocated StuckConfig populated with the
// package defaults.
func DefaultStuckConfig() *StuckConfig {
	cfg := new(StuckConfig)
	cfg.PingTimeout = DefaultPingTimeout
	cfg.ConsecutiveFailures = DefaultConsecutiveFailures
	cfg.Cooldown = DefaultCooldown
	return cfg
}
// AgentHealthState tracks the health check state for a single agent.
// Values are stored per agent in HealthCheckState.Agents and serialized as JSON.
// NOTE(review): encoding/json's omitempty does not omit struct values such as
// time.Time, so unset timestamps are presumably still written as the zero time — confirm.
type AgentHealthState struct {
// AgentID is the identifier (e.g., "gastown/polecats/max" or "deacon")
AgentID string `json:"agent_id"`
// LastPingTime is when we last sent a HEALTH_CHECK nudge
LastPingTime time.Time `json:"last_ping_time,omitempty"`
// LastResponseTime is when the agent last updated their activity
LastResponseTime time.Time `json:"last_response_time,omitempty"`
// ConsecutiveFailures counts how many health checks failed in a row
ConsecutiveFailures int `json:"consecutive_failures"`
// LastForceKillTime is when we last force-killed this agent
LastForceKillTime time.Time `json:"last_force_kill_time,omitempty"`
// ForceKillCount is total number of force-kills for this agent
ForceKillCount int `json:"force_kill_count"`
}
// HealthCheckState holds health check state for all monitored agents.
// It is the top-level document read by LoadHealthCheckState and written by
// SaveHealthCheckState.
type HealthCheckState struct {
// Agents maps agent ID to their health state
Agents map[string]*AgentHealthState `json:"agents"`
// LastUpdated is when this state was last written
// (SaveHealthCheckState sets it to the current UTC time on every save).
LastUpdated time.Time `json:"last_updated"`
}
// HealthCheckStateFile returns the location of the health check state file,
// i.e. "health-check-state.json" inside the "deacon" directory under townRoot.
func HealthCheckStateFile(townRoot string) string {
	const stateFileName = "health-check-state.json"
	deaconDir := filepath.Join(townRoot, "deacon")
	return filepath.Join(deaconDir, stateFileName)
}
// LoadHealthCheckState loads the health check state from disk.
//
// A missing or zero-length state file yields an empty, ready-to-use state
// rather than an error, so a first run (or a file truncated by an interrupted
// write) does not block health checks. Any other read failure and any JSON
// syntax error is returned wrapped with context.
//
// The returned state always has a non-nil Agents map, and entries whose value
// decoded as null are dropped so callers never encounter a nil agent state.
func LoadHealthCheckState(townRoot string) (*HealthCheckState, error) {
	stateFile := HealthCheckStateFile(townRoot)

	data, err := os.ReadFile(stateFile)
	if err != nil {
		// errors.Is also matches wrapped not-exist errors, unlike os.IsNotExist.
		if errors.Is(err, os.ErrNotExist) {
			return &HealthCheckState{
				Agents: make(map[string]*AgentHealthState),
			}, nil
		}
		return nil, fmt.Errorf("reading health check state: %w", err)
	}
	if len(data) == 0 {
		// An empty file holds no state; treat it like a missing file.
		return &HealthCheckState{
			Agents: make(map[string]*AgentHealthState),
		}, nil
	}

	var state HealthCheckState
	if err := json.Unmarshal(data, &state); err != nil {
		return nil, fmt.Errorf("parsing health check state: %w", err)
	}
	if state.Agents == nil {
		state.Agents = make(map[string]*AgentHealthState)
	}
	// `"agents": {"x": null}` decodes to a nil pointer; remove such entries.
	for agentID, agentState := range state.Agents {
		if agentState == nil {
			delete(state.Agents, agentID)
		}
	}
	return &state, nil
}
// SaveHealthCheckState saves the health check state to disk.
//
// It creates the state directory if needed, stamps state.LastUpdated with the
// current UTC time, and writes the indented JSON document. The data is first
// written to a temporary file in the same directory and then renamed over the
// state file, so a reader (or a crash mid-write) never observes a partially
// written file. A nil state is rejected with an error.
func SaveHealthCheckState(townRoot string, state *HealthCheckState) error {
	if state == nil {
		return errors.New("saving health check state: nil state")
	}

	stateFile := HealthCheckStateFile(townRoot)
	dir := filepath.Dir(stateFile)

	// Ensure directory exists
	if err := os.MkdirAll(dir, 0755); err != nil {
		return fmt.Errorf("creating deacon directory: %w", err)
	}

	state.LastUpdated = time.Now().UTC()
	data, err := json.MarshalIndent(state, "", " ")
	if err != nil {
		return fmt.Errorf("marshaling health check state: %w", err)
	}

	// The temp file must live in the target directory so the rename stays on
	// one filesystem.
	tmp, err := os.CreateTemp(dir, filepath.Base(stateFile)+".*.tmp")
	if err != nil {
		return fmt.Errorf("creating temporary state file: %w", err)
	}
	tmpName := tmp.Name()
	// discard closes and removes the temp file after a failed step; its own
	// errors are ignored because the original failure is what gets reported.
	discard := func() {
		_ = tmp.Close()
		_ = os.Remove(tmpName)
	}

	if _, err := tmp.Write(data); err != nil {
		discard()
		return fmt.Errorf("writing health check state: %w", err)
	}
	// CreateTemp uses mode 0600; restore the 0644 mode the state file has always had.
	if err := tmp.Chmod(0644); err != nil {
		discard()
		return fmt.Errorf("setting health check state permissions: %w", err)
	}
	if err := tmp.Close(); err != nil {
		_ = os.Remove(tmpName) // best-effort cleanup
		return fmt.Errorf("closing health check state: %w", err)
	}
	if err := os.Rename(tmpName, stateFile); err != nil {
		_ = os.Remove(tmpName) // best-effort cleanup
		return fmt.Errorf("replacing health check state: %w", err)
	}
	return nil
}
// GetAgentState returns the health state for an agent, creating and storing a
// fresh entry (with AgentID set) when none exists yet. The Agents map is
// allocated on demand, so the zero HealthCheckState is usable.
//
// The result is never nil: a key that is present with a nil value (for
// example an `"x": null` entry decoded from the state file) is treated as
// missing and replaced.
func (s *HealthCheckState) GetAgentState(agentID string) *AgentHealthState {
	if s.Agents == nil {
		s.Agents = make(map[string]*AgentHealthState)
	}
	// Check the value, not mere key presence, so nil entries are replaced too.
	state := s.Agents[agentID]
	if state == nil {
		state = &AgentHealthState{AgentID: agentID}
		s.Agents[agentID] = state
	}
	return state
}
// HealthCheckResult represents the outcome of a health check.
// The two duration fields use omitempty and are therefore left out of the
// JSON encoding when zero.
type HealthCheckResult struct {
// AgentID identifies the agent that was checked.
AgentID string `json:"agent_id"`
// Responded reports whether the agent answered the check.
Responded bool `json:"responded"`
// ResponseTime is presumably how long the agent took to respond — TODO confirm with the producer.
ResponseTime time.Duration `json:"response_time,omitempty"`
// ConsecutiveFailures is the agent's failure streak.
ConsecutiveFailures int `json:"consecutive_failures"`
// ShouldForceKill reports whether a force-kill is recommended.
ShouldForceKill bool `json:"should_force_kill"`
// InCooldown reports whether the agent is within its force-kill cooldown.
InCooldown bool `json:"in_cooldown"`
// CooldownRemaining is the time left in the cooldown, if any.
CooldownRemaining time.Duration `json:"cooldown_remaining,omitempty"`
}
// Sentinel errors for stuck-session detection; compare with errors.Is.

// ErrAgentInCooldown indicates the agent was force-killed too recently.
var ErrAgentInCooldown = errors.New("agent is in cooldown period after recent force-kill")

// ErrAgentNotFound indicates the agent or its session could not be found.
var ErrAgentNotFound = errors.New("agent not found or session doesn't exist")

// ErrAgentResponsive indicates the agent is healthy and needs no intervention.
var ErrAgentResponsive = errors.New("agent is responsive, no action needed")
// RecordPing stamps LastPingTime with the current time in UTC, marking that a
// health check ping was just sent to the agent.
func (s *AgentHealthState) RecordPing() {
	now := time.Now()
	s.LastPingTime = now.UTC()
}
// RecordResponse notes that the agent answered a health check: the failure
// streak is cleared and LastResponseTime is set to the current UTC time.
func (s *AgentHealthState) RecordResponse() {
	s.ConsecutiveFailures = 0
	s.LastResponseTime = time.Now().UTC()
}
// RecordFailure extends the agent's failure streak by one after a health
// check went unanswered.
func (s *AgentHealthState) RecordFailure() {
	s.ConsecutiveFailures += 1
}
// RecordForceKill notes that the agent was just force-killed: the kill counter
// goes up by one, the failure streak starts over, and LastForceKillTime is
// set to the current UTC time (which starts the cooldown).
func (s *AgentHealthState) RecordForceKill() {
	s.ForceKillCount += 1
	s.ConsecutiveFailures = 0 // a killed agent starts with a clean streak
	s.LastForceKillTime = time.Now().UTC()
}
// IsInCooldown reports whether the agent has been force-killed and less than
// cooldown has elapsed since then. An agent with no recorded force-kill is
// never in cooldown.
func (s *AgentHealthState) IsInCooldown(cooldown time.Duration) bool {
	return !s.LastForceKillTime.IsZero() && time.Since(s.LastForceKillTime) < cooldown
}
// CooldownRemaining returns the time left before the force-kill cooldown
// expires. The result is 0 when the agent was never force-killed or the
// cooldown has already run out; it is never negative.
func (s *AgentHealthState) CooldownRemaining(cooldown time.Duration) time.Duration {
	last := s.LastForceKillTime
	if last.IsZero() {
		return 0
	}
	if left := cooldown - time.Since(last); left > 0 {
		return left
	}
	return 0
}
// ShouldForceKill reports whether the agent's consecutive failure count has
// reached or exceeded threshold.
func (s *AgentHealthState) ShouldForceKill(threshold int) bool {
	return threshold <= s.ConsecutiveFailures
}

View File

@@ -0,0 +1,306 @@
package deacon
import (
"os"
"path/filepath"
"testing"
"time"
)
// TestDefaultStuckConfig checks that DefaultStuckConfig mirrors the package
// default constants field by field.
func TestDefaultStuckConfig(t *testing.T) {
	got := DefaultStuckConfig()

	if want := DefaultPingTimeout; got.PingTimeout != want {
		t.Errorf("PingTimeout = %v, want %v", got.PingTimeout, want)
	}
	if want := DefaultConsecutiveFailures; got.ConsecutiveFailures != want {
		t.Errorf("ConsecutiveFailures = %v, want %v", got.ConsecutiveFailures, want)
	}
	if want := DefaultCooldown; got.Cooldown != want {
		t.Errorf("Cooldown = %v, want %v", got.Cooldown, want)
	}
}
// TestHealthCheckStateFile checks that the state file lives at
// <townRoot>/deacon/health-check-state.json. The expected value is built with
// filepath so the test does not depend on the platform's path separator.
func TestHealthCheckStateFile(t *testing.T) {
	townRoot := filepath.FromSlash("/tmp/test-town")
	path := HealthCheckStateFile(townRoot)
	expected := filepath.Join(townRoot, "deacon", "health-check-state.json")
	if path != expected {
		t.Errorf("HealthCheckStateFile = %q, want %q", path, expected)
	}
}
// TestLoadHealthCheckState_NonExistent checks that loading from a directory
// without a state file succeeds and yields an initialized, empty Agents map.
func TestLoadHealthCheckState_NonExistent(t *testing.T) {
	loaded, err := LoadHealthCheckState(t.TempDir())
	if err != nil {
		t.Fatalf("LoadHealthCheckState() error = %v", err)
	}

	if loaded.Agents == nil {
		t.Error("Agents map should be initialized")
	}
	if n := len(loaded.Agents); n != 0 {
		t.Errorf("Expected empty agents map, got %d entries", n)
	}
}
// TestSaveAndLoadHealthCheckState checks that state written by
// SaveHealthCheckState is read back intact by LoadHealthCheckState.
func TestSaveAndLoadHealthCheckState(t *testing.T) {
	tmpDir := t.TempDir()

	// Create state with some data
	state := &HealthCheckState{
		Agents: map[string]*AgentHealthState{
			"gastown/polecats/max": {
				AgentID:             "gastown/polecats/max",
				ConsecutiveFailures: 2,
				ForceKillCount:      1,
			},
		},
	}

	// Save
	if err := SaveHealthCheckState(tmpDir, state); err != nil {
		t.Fatalf("SaveHealthCheckState() error = %v", err)
	}

	// Verify file exists. Any Stat failure (not only "does not exist") means
	// the file is unusable, so fail on every error.
	stateFile := HealthCheckStateFile(tmpDir)
	if _, err := os.Stat(stateFile); err != nil {
		t.Fatalf("State file was not created: %v", err)
	}

	// Load
	loaded, err := LoadHealthCheckState(tmpDir)
	if err != nil {
		t.Fatalf("LoadHealthCheckState() error = %v", err)
	}

	// Verify loaded data
	agent := loaded.Agents["gastown/polecats/max"]
	if agent == nil {
		t.Fatal("Agent not found in loaded state")
	}
	if agent.ConsecutiveFailures != 2 {
		t.Errorf("ConsecutiveFailures = %d, want 2", agent.ConsecutiveFailures)
	}
	if agent.ForceKillCount != 1 {
		t.Errorf("ForceKillCount = %d, want 1", agent.ForceKillCount)
	}
}
// TestGetAgentState checks that GetAgentState lazily creates an entry with
// the requested ID and hands back that same entry on later calls.
func TestGetAgentState(t *testing.T) {
	const id = "test/agent"
	var state HealthCheckState

	// The first lookup must create the entry.
	first := state.GetAgentState(id)
	if first == nil {
		t.Fatal("GetAgentState returned nil")
	}
	if first.AgentID != id {
		t.Errorf("AgentID = %q, want %q", first.AgentID, id)
	}

	// A repeated lookup must not allocate a new entry.
	if second := state.GetAgentState(id); first != second {
		t.Error("GetAgentState should return the same pointer")
	}
}
// TestAgentHealthState_RecordPing checks that RecordPing sets LastPingTime to
// a moment between the instants sampled just before and just after the call.
func TestAgentHealthState_RecordPing(t *testing.T) {
	var agent AgentHealthState

	start := time.Now()
	agent.RecordPing()
	end := time.Now()

	if got := agent.LastPingTime; got.Before(start) || got.After(end) {
		t.Error("LastPingTime should be set to current time")
	}
}
// TestAgentHealthState_RecordResponse checks that RecordResponse timestamps
// the response and clears an existing failure streak.
func TestAgentHealthState_RecordResponse(t *testing.T) {
	var agent AgentHealthState
	agent.ConsecutiveFailures = 5

	start := time.Now()
	agent.RecordResponse()
	end := time.Now()

	if got := agent.LastResponseTime; got.Before(start) || got.After(end) {
		t.Error("LastResponseTime should be set to current time")
	}
	if got := agent.ConsecutiveFailures; got != 0 {
		t.Errorf("ConsecutiveFailures should be reset to 0, got %d", got)
	}
}
// TestAgentHealthState_RecordFailure checks that RecordFailure adds exactly
// one to an existing failure streak.
func TestAgentHealthState_RecordFailure(t *testing.T) {
	var agent AgentHealthState
	agent.ConsecutiveFailures = 2

	agent.RecordFailure()

	if got := agent.ConsecutiveFailures; got != 3 {
		t.Errorf("ConsecutiveFailures = %d, want 3", got)
	}
}
// TestAgentHealthState_RecordForceKill checks that RecordForceKill timestamps
// the kill, bumps the kill counter and clears the failure streak.
func TestAgentHealthState_RecordForceKill(t *testing.T) {
	var agent AgentHealthState
	agent.ConsecutiveFailures = 5
	agent.ForceKillCount = 2

	start := time.Now()
	agent.RecordForceKill()
	end := time.Now()

	if got := agent.LastForceKillTime; got.Before(start) || got.After(end) {
		t.Error("LastForceKillTime should be set to current time")
	}
	if got := agent.ForceKillCount; got != 3 {
		t.Errorf("ForceKillCount = %d, want 3", got)
	}
	if got := agent.ConsecutiveFailures; got != 0 {
		t.Errorf("ConsecutiveFailures should be reset to 0, got %d", got)
	}
}
// TestAgentHealthState_IsInCooldown exercises IsInCooldown with a 5 minute
// cooldown for an agent that was never killed, one killed a minute ago and
// one killed ten minutes ago.
func TestAgentHealthState_IsInCooldown(t *testing.T) {
	const cooldown = 5 * time.Minute

	type testCase struct {
		name              string
		lastForceKillTime time.Time
		want              bool
	}
	cases := []testCase{
		{"no force-kill history", time.Time{}, false},
		{"recently killed", time.Now().Add(-1 * time.Minute), true},
		{"cooldown expired", time.Now().Add(-10 * time.Minute), false},
	}

	for _, tc := range cases {
		tc := tc // keep a per-iteration copy for the closure
		t.Run(tc.name, func(t *testing.T) {
			var agent AgentHealthState
			agent.LastForceKillTime = tc.lastForceKillTime

			got := agent.IsInCooldown(cooldown)
			if got != tc.want {
				t.Errorf("IsInCooldown() = %v, want %v", got, tc.want)
			}
		})
	}
}
// TestAgentHealthState_CooldownRemaining checks, for a 5 minute cooldown,
// that the remaining time is zero without a kill or after expiry and
// non-zero shortly after a kill.
func TestAgentHealthState_CooldownRemaining(t *testing.T) {
	const cooldown = 5 * time.Minute

	type testCase struct {
		name              string
		lastForceKillTime time.Time
		wantZero          bool
	}
	cases := []testCase{
		{"no force-kill history", time.Time{}, true},
		{"recently killed", time.Now().Add(-1 * time.Minute), false},
		{"cooldown expired", time.Now().Add(-10 * time.Minute), true},
	}

	for _, tc := range cases {
		tc := tc // keep a per-iteration copy for the closure
		t.Run(tc.name, func(t *testing.T) {
			var agent AgentHealthState
			agent.LastForceKillTime = tc.lastForceKillTime

			got := agent.CooldownRemaining(cooldown)
			isZero := got == 0
			if tc.wantZero && !isZero {
				t.Errorf("CooldownRemaining() = %v, want 0", got)
			}
			if !tc.wantZero && isZero {
				t.Error("CooldownRemaining() = 0, want non-zero")
			}
		})
	}
}
// TestAgentHealthState_ShouldForceKill checks the threshold comparison for
// failure counts below, at and above the threshold.
func TestAgentHealthState_ShouldForceKill(t *testing.T) {
	type testCase struct {
		name      string
		failures  int
		threshold int
		want      bool
	}
	cases := []testCase{
		{"below threshold", 2, 3, false},
		{"at threshold", 3, 3, true},
		{"above threshold", 5, 3, true},
	}

	for _, tc := range cases {
		tc := tc // keep a per-iteration copy for the closure
		t.Run(tc.name, func(t *testing.T) {
			var agent AgentHealthState
			agent.ConsecutiveFailures = tc.failures

			got := agent.ShouldForceKill(tc.threshold)
			if got != tc.want {
				t.Errorf("ShouldForceKill() = %v, want %v", got, tc.want)
			}
		})
	}
}
// TestSaveHealthCheckState_CreatesDirectory checks that saving into a town
// root that does not exist yet creates the whole directory chain down to the
// deacon directory.
func TestSaveHealthCheckState_CreatesDirectory(t *testing.T) {
	tmpDir := t.TempDir()
	nestedDir := filepath.Join(tmpDir, "nonexistent", "deacon")

	state := &HealthCheckState{
		Agents: make(map[string]*AgentHealthState),
	}

	// Should create the directory structure
	if err := SaveHealthCheckState(filepath.Join(tmpDir, "nonexistent"), state); err != nil {
		t.Fatalf("SaveHealthCheckState() error = %v", err)
	}

	// Verify directory was created. Fail on any Stat error, and make sure the
	// path really is a directory rather than some other kind of file.
	info, err := os.Stat(nestedDir)
	if err != nil {
		t.Fatalf("Directory should have been created: %v", err)
	}
	if !info.IsDir() {
		t.Errorf("%s exists but is not a directory", nestedDir)
	}
}