diff --git a/internal/cmd/deacon.go b/internal/cmd/deacon.go index 33841d2e..8da4386e 100644 --- a/internal/cmd/deacon.go +++ b/internal/cmd/deacon.go @@ -1,9 +1,11 @@ package cmd import ( + "encoding/json" "errors" "fmt" "os" + "os/exec" "path/filepath" "strings" "time" @@ -114,8 +116,84 @@ This command is typically called by the daemon during cold startup.`, RunE: runDeaconTriggerPending, } +var deaconHealthCheckCmd = &cobra.Command{ + Use: "health-check ", + Short: "Send a health check ping to an agent and track response", + Long: `Send a HEALTH_CHECK nudge to an agent and wait for response. -var triggerTimeout time.Duration +This command is used by the Deacon during health rounds to detect stuck sessions. +It tracks consecutive failures and determines when force-kill is warranted. + +The detection protocol: +1. Send HEALTH_CHECK nudge to the agent +2. Wait for agent to update their bead (configurable timeout, default 30s) +3. If no activity update, increment failure counter +4. After N consecutive failures (default 3), recommend force-kill + +Exit codes: + 0 - Agent responded or is in cooldown (no action needed) + 1 - Error occurred + 2 - Agent should be force-killed (consecutive failures exceeded) + +Examples: + gt deacon health-check gastown/polecats/max + gt deacon health-check gastown/witness --timeout=60s + gt deacon health-check deacon --failures=5`, + Args: cobra.ExactArgs(1), + RunE: runDeaconHealthCheck, +} + +var deaconForceKillCmd = &cobra.Command{ + Use: "force-kill ", + Short: "Force-kill an unresponsive agent session", + Long: `Force-kill an agent session that has been detected as stuck. + +This command is used by the Deacon when an agent fails consecutive health checks. +It performs the force-kill protocol: + +1. Log the intervention (send mail to agent) +2. Kill the tmux session +3. Update agent bead state to "killed" +4. Notify mayor (optional, for visibility) + +After force-kill, the agent is 'asleep'. 
Normal wake mechanisms apply: +- gt rig boot restarts it +- Or stays asleep until next activity trigger + +This respects the cooldown period - won't kill if recently killed. + +Examples: + gt deacon force-kill gastown/polecats/max + gt deacon force-kill gastown/witness --reason="unresponsive for 90s"`, + Args: cobra.ExactArgs(1), + RunE: runDeaconForceKill, +} + +var deaconHealthStateCmd = &cobra.Command{ + Use: "health-state", + Short: "Show health check state for all monitored agents", + Long: `Display the current health check state including: +- Consecutive failure counts +- Last ping and response times +- Force-kill history and cooldowns + +This helps the Deacon understand which agents may need attention.`, + RunE: runDeaconHealthState, +} + + +var ( + triggerTimeout time.Duration + + // Health check flags + healthCheckTimeout time.Duration + healthCheckFailures int + healthCheckCooldown time.Duration + + // Force kill flags + forceKillReason string + forceKillSkipNotify bool +) func init() { deaconCmd.AddCommand(deaconStartCmd) @@ -125,11 +203,28 @@ func init() { deaconCmd.AddCommand(deaconRestartCmd) deaconCmd.AddCommand(deaconHeartbeatCmd) deaconCmd.AddCommand(deaconTriggerPendingCmd) + deaconCmd.AddCommand(deaconHealthCheckCmd) + deaconCmd.AddCommand(deaconForceKillCmd) + deaconCmd.AddCommand(deaconHealthStateCmd) // Flags for trigger-pending deaconTriggerPendingCmd.Flags().DurationVar(&triggerTimeout, "timeout", 2*time.Second, "Timeout for checking if Claude is ready") + // Flags for health-check + deaconHealthCheckCmd.Flags().DurationVar(&healthCheckTimeout, "timeout", 30*time.Second, + "How long to wait for agent response") + deaconHealthCheckCmd.Flags().IntVar(&healthCheckFailures, "failures", 3, + "Number of consecutive failures before recommending force-kill") + deaconHealthCheckCmd.Flags().DurationVar(&healthCheckCooldown, "cooldown", 5*time.Minute, + "Minimum time between force-kills of same agent") + + // Flags for force-kill + 
deaconForceKillCmd.Flags().StringVar(&forceKillReason, "reason", "", + "Reason for force-kill (included in notifications)") + deaconForceKillCmd.Flags().BoolVar(&forceKillSkipNotify, "skip-notify", false, + "Skip sending notification mail to mayor") + rootCmd.AddCommand(deaconCmd) } @@ -465,3 +560,326 @@ func ensurePatrolHooks(workspacePath string) error { return os.WriteFile(settingsPath, []byte(hooksJSON), 0600) } +// runDeaconHealthCheck implements the health-check command. +// It sends a HEALTH_CHECK nudge to an agent, waits for response, and tracks state. +func runDeaconHealthCheck(cmd *cobra.Command, args []string) error { + agent := args[0] + + townRoot, err := workspace.FindFromCwdOrError() + if err != nil { + return fmt.Errorf("not in a Gas Town workspace: %w", err) + } + + // Load health check state + state, err := deacon.LoadHealthCheckState(townRoot) + if err != nil { + return fmt.Errorf("loading health check state: %w", err) + } + agentState := state.GetAgentState(agent) + + // Check if agent is in cooldown + if agentState.IsInCooldown(healthCheckCooldown) { + remaining := agentState.CooldownRemaining(healthCheckCooldown) + fmt.Printf("%s Agent %s is in cooldown (remaining: %s)\n", + style.Dim.Render("○"), agent, remaining.Round(time.Second)) + return nil + } + + // Get agent bead info before ping (for baseline) + beadID, sessionName, err := agentAddressToIDs(agent) + if err != nil { + return fmt.Errorf("invalid agent address: %w", err) + } + + t := tmux.NewTmux() + + // Check if session exists + exists, err := t.HasSession(sessionName) + if err != nil { + return fmt.Errorf("checking session: %w", err) + } + if !exists { + fmt.Printf("%s Agent %s session not running\n", style.Dim.Render("○"), agent) + return nil + } + + // Get current bead update time + baselineTime, err := getAgentBeadUpdateTime(townRoot, beadID) + if err != nil { + // Bead might not exist yet - that's okay + baselineTime = time.Time{} + } + + // Record ping + agentState.RecordPing() 
+ + // Send health check nudge + if err := t.NudgeSession(sessionName, "HEALTH_CHECK: respond with any action to confirm responsiveness"); err != nil { + return fmt.Errorf("sending nudge: %w", err) + } + + fmt.Printf("%s Sent HEALTH_CHECK to %s, waiting %s...\n", + style.Bold.Render("→"), agent, healthCheckTimeout) + + // Wait for response + deadline := time.Now().Add(healthCheckTimeout) + responded := false + + for time.Now().Before(deadline) { + time.Sleep(2 * time.Second) // Check every 2 seconds + + newTime, err := getAgentBeadUpdateTime(townRoot, beadID) + if err != nil { + continue + } + + // If bead was updated after our baseline, agent responded + if newTime.After(baselineTime) { + responded = true + break + } + } + + // Record result + if responded { + agentState.RecordResponse() + if err := deacon.SaveHealthCheckState(townRoot, state); err != nil { + style.PrintWarning("failed to save health check state: %v", err) + } + fmt.Printf("%s Agent %s responded (failures reset to 0)\n", + style.Bold.Render("✓"), agent) + return nil + } + + // No response - record failure + agentState.RecordFailure() + if err := deacon.SaveHealthCheckState(townRoot, state); err != nil { + style.PrintWarning("failed to save health check state: %v", err) + } + + fmt.Printf("%s Agent %s did not respond (consecutive failures: %d/%d)\n", + style.Dim.Render("⚠"), agent, agentState.ConsecutiveFailures, healthCheckFailures) + + // Check if force-kill threshold reached + if agentState.ShouldForceKill(healthCheckFailures) { + fmt.Printf("%s Agent %s should be force-killed\n", style.Bold.Render("✗"), agent) + os.Exit(2) // Exit code 2 = should force-kill + } + + return nil +} + +// runDeaconForceKill implements the force-kill command. +// It kills a stuck agent session and updates its bead state. 
// runDeaconForceKill implements the force-kill command.
// It kills a stuck agent session and updates its bead state. The four steps
// run in a fixed order (notify agent -> kill session -> update bead -> notify
// mayor); only the session kill is fatal on error, the rest are best-effort.
func runDeaconForceKill(cmd *cobra.Command, args []string) error {
	agent := args[0]

	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}

	// Load health check state so we can enforce the cooldown and report
	// failure/kill counters.
	state, err := deacon.LoadHealthCheckState(townRoot)
	if err != nil {
		return fmt.Errorf("loading health check state: %w", err)
	}
	agentState := state.GetAgentState(agent)

	// Check cooldown (unless bypassed) — refuses a repeat kill inside the
	// cooldown window. NOTE(review): there is no actual bypass flag despite
	// the "unless bypassed" wording; confirm whether one was intended.
	if agentState.IsInCooldown(healthCheckCooldown) {
		remaining := agentState.CooldownRemaining(healthCheckCooldown)
		return fmt.Errorf("agent %s is in cooldown (remaining: %s) - cannot force-kill yet",
			agent, remaining.Round(time.Second))
	}

	// Only the session name is needed here; the bead ID is re-derived inside
	// updateAgentBeadState.
	_, sessionName, err := agentAddressToIDs(agent)
	if err != nil {
		return fmt.Errorf("invalid agent address: %w", err)
	}

	t := tmux.NewTmux()

	// Nothing to kill if the session is already gone.
	exists, err := t.HasSession(sessionName)
	if err != nil {
		return fmt.Errorf("checking session: %w", err)
	}
	if !exists {
		fmt.Printf("%s Agent %s session not running\n", style.Dim.Render("○"), agent)
		return nil
	}

	// Build reason: caller-supplied --reason wins, otherwise synthesize one
	// from the recorded failure streak.
	reason := forceKillReason
	if reason == "" {
		reason = fmt.Sprintf("unresponsive after %d consecutive health check failures",
			agentState.ConsecutiveFailures)
	}

	// Step 1: Log the intervention (send mail to agent, best-effort).
	fmt.Printf("%s Sending force-kill notification to %s...\n", style.Dim.Render("1."), agent)
	mailBody := fmt.Sprintf("Deacon detected %s as unresponsive.\nReason: %s\nAction: force-killing session", agent, reason)
	sendMail(townRoot, agent, "FORCE_KILL: unresponsive", mailBody)

	// Step 2: Kill the tmux session — the only step whose failure aborts.
	fmt.Printf("%s Killing tmux session %s...\n", style.Dim.Render("2."), sessionName)
	if err := t.KillSession(sessionName); err != nil {
		return fmt.Errorf("killing session: %w", err)
	}

	// Step 3: Update agent bead state (best effort; errors are swallowed
	// inside updateAgentBeadState).
	fmt.Printf("%s Updating agent bead state to 'killed'...\n", style.Dim.Render("3."))
	updateAgentBeadState(townRoot, agent, "killed", reason)

	// Step 4: Notify mayor (optional). NOTE(review): the recipient address is
	// "mayor/" with a trailing slash while agentAddressToIDs accepts plain
	// "mayor" — verify which form `gt mail send` expects.
	if !forceKillSkipNotify {
		fmt.Printf("%s Notifying mayor...\n", style.Dim.Render("4."))
		notifyBody := fmt.Sprintf("Agent %s was force-killed by Deacon.\nReason: %s", agent, reason)
		sendMail(townRoot, "mayor/", "Agent killed: "+agent, notifyBody)
	}

	// Persist the kill (stamps cooldown start, bumps kill count, resets the
	// failure streak).
	agentState.RecordForceKill()
	if err := deacon.SaveHealthCheckState(townRoot, state); err != nil {
		style.PrintWarning("failed to save health check state: %v", err)
	}

	fmt.Printf("%s Force-killed agent %s (total kills: %d)\n",
		style.Bold.Render("✓"), agent, agentState.ForceKillCount)
	fmt.Printf("  %s\n", style.Dim.Render("Agent is now 'asleep'. Use 'gt rig boot' to restart."))

	return nil
}

// runDeaconHealthState shows the current health check state.
// Read-only: loads the persisted state and prints per-agent counters and
// cooldown status.
func runDeaconHealthState(cmd *cobra.Command, args []string) error {
	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}

	state, err := deacon.LoadHealthCheckState(townRoot)
	if err != nil {
		return fmt.Errorf("loading health check state: %w", err)
	}

	if len(state.Agents) == 0 {
		fmt.Printf("%s No health check state recorded yet\n", style.Dim.Render("○"))
		return nil
	}

	fmt.Printf("%s Health Check State (updated %s)\n\n",
		style.Bold.Render("●"),
		state.LastUpdated.Format(time.RFC3339))

	// NOTE(review): Go map iteration order is random, so agents print in a
	// different order on every run; sort the keys if stable output matters.
	for agentID, agentState := range state.Agents {
		fmt.Printf("Agent: %s\n", style.Bold.Render(agentID))

		// Zero times mean "never happened"; skip those rows.
		if !agentState.LastPingTime.IsZero() {
			fmt.Printf("  Last ping: %s ago\n", time.Since(agentState.LastPingTime).Round(time.Second))
		}
		if !agentState.LastResponseTime.IsZero() {
			fmt.Printf("  Last response: %s ago\n", time.Since(agentState.LastResponseTime).Round(time.Second))
		}

		fmt.Printf("  Consecutive failures: %d\n", agentState.ConsecutiveFailures)
		fmt.Printf("  Total force-kills: %d\n", agentState.ForceKillCount)

		if !agentState.LastForceKillTime.IsZero() {
			fmt.Printf("  Last force-kill: %s ago\n", time.Since(agentState.LastForceKillTime).Round(time.Second))
			if agentState.IsInCooldown(healthCheckCooldown) {
				remaining := agentState.CooldownRemaining(healthCheckCooldown)
				fmt.Printf("  Cooldown: %s remaining\n", remaining.Round(time.Second))
			}
		}
		fmt.Println()
	}

	return nil
}

// agentAddressToIDs converts an agent address to bead ID and session name.
// Supports formats: "gastown/polecats/max", "gastown/witness", "deacon", "mayor"
//
// Pure string mapping, no I/O. Note the asymmetry for polecats: the bead ID is
// "gt-<rig>-polecat-<name>" but the session is "gt-<rig>-<name>" (no "polecat"
// segment), while crew uses the same string for both.
func agentAddressToIDs(address string) (beadID, sessionName string, err error) {
	// Singleton agents have fixed IDs.
	switch address {
	case "deacon":
		return "gt-deacon", DeaconSessionName, nil
	case "mayor":
		return "gt-mayor", "gt-mayor", nil
	}

	parts := strings.Split(address, "/")
	switch len(parts) {
	case 2:
		// rig/role: "gastown/witness", "gastown/refinery"
		rig, role := parts[0], parts[1]
		switch role {
		case "witness":
			return fmt.Sprintf("gt-%s-witness", rig), fmt.Sprintf("gt-%s-witness", rig), nil
		case "refinery":
			return fmt.Sprintf("gt-%s-refinery", rig), fmt.Sprintf("gt-%s-refinery", rig), nil
		default:
			return "", "", fmt.Errorf("unknown role: %s", role)
		}
	case 3:
		// rig/type/name: "gastown/polecats/max", "gastown/crew/alpha"
		rig, agentType, name := parts[0], parts[1], parts[2]
		switch agentType {
		case "polecats":
			return fmt.Sprintf("gt-%s-polecat-%s", rig, name), fmt.Sprintf("gt-%s-%s", rig, name), nil
		case "crew":
			return fmt.Sprintf("gt-%s-crew-%s", rig, name), fmt.Sprintf("gt-%s-crew-%s", rig, name), nil
		default:
			return "", "", fmt.Errorf("unknown agent type: %s", agentType)
		}
	default:
		return "", "", fmt.Errorf("invalid agent address format: %s (expected rig/type/name or rig/role)", address)
	}
}
// getAgentBeadUpdateTime gets the update time from an agent bead.
// Shells out to `bd show <beadID> --json` in the town root and parses the
// "updated_at" field of the first returned issue as RFC3339.
// NOTE(review): assumes `bd show --json` emits a JSON array — confirm against
// the bd CLI.
func getAgentBeadUpdateTime(townRoot, beadID string) (time.Time, error) {
	cmd := exec.Command("bd", "show", beadID, "--json")
	cmd.Dir = townRoot

	output, err := cmd.Output()
	if err != nil {
		return time.Time{}, err
	}

	// Only the update timestamp is needed; ignore all other fields.
	var issues []struct {
		UpdatedAt string `json:"updated_at"`
	}
	if err := json.Unmarshal(output, &issues); err != nil {
		return time.Time{}, err
	}

	if len(issues) == 0 {
		return time.Time{}, fmt.Errorf("bead not found: %s", beadID)
	}

	return time.Parse(time.RFC3339, issues[0].UpdatedAt)
}

// sendMail sends a mail message using gt mail send.
// Fire-and-forget: any failure (gt missing, bad recipient) is ignored.
func sendMail(townRoot, to, subject, body string) {
	cmd := exec.Command("gt", "mail", "send", to, "-s", subject, "-m", body)
	cmd.Dir = townRoot
	_ = cmd.Run() // Best effort
}

// updateAgentBeadState updates an agent bead's state via `bd agent state`.
// Best-effort: resolution or command failures are silently ignored.
// NOTE(review): the reason parameter is accepted but never forwarded to bd —
// either drop it or pass it through if the CLI supports a reason flag.
func updateAgentBeadState(townRoot, agent, state, reason string) {
	beadID, _, err := agentAddressToIDs(agent)
	if err != nil {
		return
	}

	// Use bd agent state command
	cmd := exec.Command("bd", "agent", "state", beadID, state)
	cmd.Dir = townRoot
	_ = cmd.Run() // Best effort
}

// ---- new file: internal/deacon/stuck.go ----

// Package deacon provides the Deacon agent infrastructure.
package deacon

import (
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"time"
)

// Default parameters for stuck-session detection. The health-check CLI flags
// default to the same values.
const (
	DefaultPingTimeout         = 30 * time.Second // How long to wait for response
	DefaultConsecutiveFailures = 3                // Failures before force-kill
	DefaultCooldown            = 5 * time.Minute  // Minimum time between force-kills
)

// StuckConfig holds configurable parameters for stuck-session detection.
// StuckConfig bundles the three tunables of the stuck-session detector so
// they can be (de)serialized as one JSON object.
type StuckConfig struct {
	PingTimeout         time.Duration `json:"ping_timeout"`
	ConsecutiveFailures int           `json:"consecutive_failures"`
	Cooldown            time.Duration `json:"cooldown"`
}

// DefaultStuckConfig returns the default stuck detection config, mirroring
// the package-level Default* constants.
func DefaultStuckConfig() *StuckConfig {
	return &StuckConfig{
		PingTimeout:         DefaultPingTimeout,
		ConsecutiveFailures: DefaultConsecutiveFailures,
		Cooldown:            DefaultCooldown,
	}
}

// AgentHealthState tracks the health check state for a single agent.
// Zero times mean "never happened"; the Record* methods in this package are
// the only writers.
type AgentHealthState struct {
	// AgentID is the identifier (e.g., "gastown/polecats/max" or "deacon")
	AgentID string `json:"agent_id"`

	// LastPingTime is when we last sent a HEALTH_CHECK nudge
	LastPingTime time.Time `json:"last_ping_time,omitempty"`

	// LastResponseTime is when the agent last updated their activity
	LastResponseTime time.Time `json:"last_response_time,omitempty"`

	// ConsecutiveFailures counts how many health checks failed in a row
	ConsecutiveFailures int `json:"consecutive_failures"`

	// LastForceKillTime is when we last force-killed this agent
	LastForceKillTime time.Time `json:"last_force_kill_time,omitempty"`

	// ForceKillCount is total number of force-kills for this agent
	ForceKillCount int `json:"force_kill_count"`
}

// HealthCheckState holds health check state for all monitored agents.
// It is the unit of persistence for Load/SaveHealthCheckState.
type HealthCheckState struct {
	// Agents maps agent ID to their health state
	Agents map[string]*AgentHealthState `json:"agents"`

	// LastUpdated is when this state was last written (stamped by Save)
	LastUpdated time.Time `json:"last_updated"`
}

// HealthCheckStateFile returns the path to the health check state file:
// <townRoot>/deacon/health-check-state.json.
func HealthCheckStateFile(townRoot string) string {
	return filepath.Join(townRoot, "deacon", "health-check-state.json")
}

// LoadHealthCheckState loads the health check state from disk.
// Returns empty state if file doesn't exist.
+func LoadHealthCheckState(townRoot string) (*HealthCheckState, error) { + stateFile := HealthCheckStateFile(townRoot) + + data, err := os.ReadFile(stateFile) + if err != nil { + if os.IsNotExist(err) { + // Return empty state + return &HealthCheckState{ + Agents: make(map[string]*AgentHealthState), + }, nil + } + return nil, fmt.Errorf("reading health check state: %w", err) + } + + var state HealthCheckState + if err := json.Unmarshal(data, &state); err != nil { + return nil, fmt.Errorf("parsing health check state: %w", err) + } + + if state.Agents == nil { + state.Agents = make(map[string]*AgentHealthState) + } + + return &state, nil +} + +// SaveHealthCheckState saves the health check state to disk. +func SaveHealthCheckState(townRoot string, state *HealthCheckState) error { + stateFile := HealthCheckStateFile(townRoot) + + // Ensure directory exists + if err := os.MkdirAll(filepath.Dir(stateFile), 0755); err != nil { + return fmt.Errorf("creating deacon directory: %w", err) + } + + state.LastUpdated = time.Now().UTC() + + data, err := json.MarshalIndent(state, "", " ") + if err != nil { + return fmt.Errorf("marshaling health check state: %w", err) + } + + return os.WriteFile(stateFile, data, 0644) +} + +// GetAgentState returns the health state for an agent, creating if needed. +func (s *HealthCheckState) GetAgentState(agentID string) *AgentHealthState { + if s.Agents == nil { + s.Agents = make(map[string]*AgentHealthState) + } + + state, ok := s.Agents[agentID] + if !ok { + state = &AgentHealthState{AgentID: agentID} + s.Agents[agentID] = state + } + return state +} + +// HealthCheckResult represents the outcome of a health check. 
+type HealthCheckResult struct { + AgentID string `json:"agent_id"` + Responded bool `json:"responded"` + ResponseTime time.Duration `json:"response_time,omitempty"` + ConsecutiveFailures int `json:"consecutive_failures"` + ShouldForceKill bool `json:"should_force_kill"` + InCooldown bool `json:"in_cooldown"` + CooldownRemaining time.Duration `json:"cooldown_remaining,omitempty"` +} + +// Common errors for stuck-session detection. +var ( + ErrAgentInCooldown = errors.New("agent is in cooldown period after recent force-kill") + ErrAgentNotFound = errors.New("agent not found or session doesn't exist") + ErrAgentResponsive = errors.New("agent is responsive, no action needed") +) + +// RecordPing records that a health check ping was sent to an agent. +func (s *AgentHealthState) RecordPing() { + s.LastPingTime = time.Now().UTC() +} + +// RecordResponse records that an agent responded to a health check. +// This resets the consecutive failure counter. +func (s *AgentHealthState) RecordResponse() { + s.LastResponseTime = time.Now().UTC() + s.ConsecutiveFailures = 0 +} + +// RecordFailure records that an agent failed to respond to a health check. +func (s *AgentHealthState) RecordFailure() { + s.ConsecutiveFailures++ +} + +// RecordForceKill records that an agent was force-killed. +func (s *AgentHealthState) RecordForceKill() { + s.LastForceKillTime = time.Now().UTC() + s.ForceKillCount++ + s.ConsecutiveFailures = 0 // Reset after kill +} + +// IsInCooldown returns true if the agent was recently force-killed. +func (s *AgentHealthState) IsInCooldown(cooldown time.Duration) bool { + if s.LastForceKillTime.IsZero() { + return false + } + return time.Since(s.LastForceKillTime) < cooldown +} + +// CooldownRemaining returns how long until cooldown expires. 
// CooldownRemaining returns how long until cooldown expires.
// Returns 0 both when there is no kill history and when the cooldown has
// already elapsed, so callers can treat 0 as "not in cooldown".
func (s *AgentHealthState) CooldownRemaining(cooldown time.Duration) time.Duration {
	if s.LastForceKillTime.IsZero() {
		return 0
	}
	remaining := cooldown - time.Since(s.LastForceKillTime)
	if remaining < 0 {
		return 0
	}
	return remaining
}

// ShouldForceKill returns true if the agent has exceeded the failure threshold.
// Inclusive: reaching the threshold exactly already recommends a kill.
func (s *AgentHealthState) ShouldForceKill(threshold int) bool {
	return s.ConsecutiveFailures >= threshold
}

// ---- new file: internal/deacon/stuck_test.go ----

package deacon

import (
	"os"
	"path/filepath"
	"testing"
	"time"
)

// TestDefaultStuckConfig pins the constructor to the package constants.
func TestDefaultStuckConfig(t *testing.T) {
	config := DefaultStuckConfig()

	if config.PingTimeout != DefaultPingTimeout {
		t.Errorf("PingTimeout = %v, want %v", config.PingTimeout, DefaultPingTimeout)
	}
	if config.ConsecutiveFailures != DefaultConsecutiveFailures {
		t.Errorf("ConsecutiveFailures = %v, want %v", config.ConsecutiveFailures, DefaultConsecutiveFailures)
	}
	if config.Cooldown != DefaultCooldown {
		t.Errorf("Cooldown = %v, want %v", config.Cooldown, DefaultCooldown)
	}
}

// TestHealthCheckStateFile pins the on-disk location of the state file.
// NOTE(review): expected path uses forward slashes, so this test is
// POSIX-only; filepath.Join would differ on Windows.
func TestHealthCheckStateFile(t *testing.T) {
	path := HealthCheckStateFile("/tmp/test-town")
	expected := "/tmp/test-town/deacon/health-check-state.json"
	if path != expected {
		t.Errorf("HealthCheckStateFile = %q, want %q", path, expected)
	}
}

// TestLoadHealthCheckState_NonExistent verifies the missing-file path yields
// a usable empty state rather than an error.
func TestLoadHealthCheckState_NonExistent(t *testing.T) {
	tmpDir := t.TempDir()

	state, err := LoadHealthCheckState(tmpDir)
	if err != nil {
		t.Fatalf("LoadHealthCheckState() error = %v", err)
	}
	if state.Agents == nil {
		t.Error("Agents map should be initialized")
	}
	if len(state.Agents) != 0 {
		t.Errorf("Expected empty agents map, got %d entries", len(state.Agents))
	}
}

// TestSaveAndLoadHealthCheckState round-trips a populated state through disk.
func TestSaveAndLoadHealthCheckState(t *testing.T) {
	tmpDir := t.TempDir()

	// Create state with some data
	state := &HealthCheckState{
		Agents: map[string]*AgentHealthState{
			"gastown/polecats/max": {
				AgentID:             "gastown/polecats/max",
				ConsecutiveFailures: 2,
				ForceKillCount:      1,
			},
		},
	}

	// Save
	if err := SaveHealthCheckState(tmpDir, state); err != nil {
		t.Fatalf("SaveHealthCheckState() error = %v", err)
	}

	// Verify file exists
	stateFile := HealthCheckStateFile(tmpDir)
	if _, err := os.Stat(stateFile); os.IsNotExist(err) {
		t.Fatal("State file was not created")
	}

	// Load
	loaded, err := LoadHealthCheckState(tmpDir)
	if err != nil {
		t.Fatalf("LoadHealthCheckState() error = %v", err)
	}

	// Verify loaded data
	agent := loaded.Agents["gastown/polecats/max"]
	if agent == nil {
		t.Fatal("Agent not found in loaded state")
	}
	if agent.ConsecutiveFailures != 2 {
		t.Errorf("ConsecutiveFailures = %d, want 2", agent.ConsecutiveFailures)
	}
	if agent.ForceKillCount != 1 {
		t.Errorf("ForceKillCount = %d, want 1", agent.ForceKillCount)
	}
}
// TestGetAgentState verifies lazy creation and pointer identity of per-agent
// entries.
func TestGetAgentState(t *testing.T) {
	state := &HealthCheckState{}

	// First call creates the agent
	agent1 := state.GetAgentState("test/agent")
	if agent1 == nil {
		t.Fatal("GetAgentState returned nil")
	}
	if agent1.AgentID != "test/agent" {
		t.Errorf("AgentID = %q, want %q", agent1.AgentID, "test/agent")
	}

	// Second call returns same agent
	agent2 := state.GetAgentState("test/agent")
	if agent1 != agent2 {
		t.Error("GetAgentState should return the same pointer")
	}
}

// TestAgentHealthState_RecordPing brackets the ping timestamp between two
// time.Now() samples. NOTE(review): RecordPing stores UTC; Before/After
// comparisons are instant-based so the location difference is harmless.
func TestAgentHealthState_RecordPing(t *testing.T) {
	agent := &AgentHealthState{}

	before := time.Now()
	agent.RecordPing()
	after := time.Now()

	if agent.LastPingTime.Before(before) || agent.LastPingTime.After(after) {
		t.Error("LastPingTime should be set to current time")
	}
}

// TestAgentHealthState_RecordResponse checks both the timestamp and that the
// failure streak is wiped.
func TestAgentHealthState_RecordResponse(t *testing.T) {
	agent := &AgentHealthState{
		ConsecutiveFailures: 5,
	}

	before := time.Now()
	agent.RecordResponse()
	after := time.Now()

	if agent.LastResponseTime.Before(before) || agent.LastResponseTime.After(after) {
		t.Error("LastResponseTime should be set to current time")
	}
	if agent.ConsecutiveFailures != 0 {
		t.Errorf("ConsecutiveFailures should be reset to 0, got %d", agent.ConsecutiveFailures)
	}
}

// TestAgentHealthState_RecordFailure checks the streak increments by one.
func TestAgentHealthState_RecordFailure(t *testing.T) {
	agent := &AgentHealthState{
		ConsecutiveFailures: 2,
	}

	agent.RecordFailure()

	if agent.ConsecutiveFailures != 3 {
		t.Errorf("ConsecutiveFailures = %d, want 3", agent.ConsecutiveFailures)
	}
}

// TestAgentHealthState_RecordForceKill checks timestamp, counter bump, and
// streak reset together.
func TestAgentHealthState_RecordForceKill(t *testing.T) {
	agent := &AgentHealthState{
		ConsecutiveFailures: 5,
		ForceKillCount:      2,
	}

	before := time.Now()
	agent.RecordForceKill()
	after := time.Now()

	if agent.LastForceKillTime.Before(before) || agent.LastForceKillTime.After(after) {
		t.Error("LastForceKillTime should be set to current time")
	}
	if agent.ForceKillCount != 3 {
		t.Errorf("ForceKillCount = %d, want 3", agent.ForceKillCount)
	}
	if agent.ConsecutiveFailures != 0 {
		t.Errorf("ConsecutiveFailures should be reset to 0, got %d", agent.ConsecutiveFailures)
	}
}

// TestAgentHealthState_IsInCooldown: table-driven over the three regimes —
// no history, inside the window, and expired.
func TestAgentHealthState_IsInCooldown(t *testing.T) {
	cooldown := 5 * time.Minute

	tests := []struct {
		name              string
		lastForceKillTime time.Time
		want              bool
	}{
		{
			name:              "no force-kill history",
			lastForceKillTime: time.Time{},
			want:              false,
		},
		{
			name:              "recently killed",
			lastForceKillTime: time.Now().Add(-1 * time.Minute),
			want:              true,
		},
		{
			name:              "cooldown expired",
			lastForceKillTime: time.Now().Add(-10 * time.Minute),
			want:              false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			agent := &AgentHealthState{
				LastForceKillTime: tt.lastForceKillTime,
			}
			if got := agent.IsInCooldown(cooldown); got != tt.want {
				t.Errorf("IsInCooldown() = %v, want %v", got, tt.want)
			}
		})
	}
}

// TestAgentHealthState_CooldownRemaining: only zero/non-zero is asserted
// because the exact remainder depends on wall-clock elapsed time.
func TestAgentHealthState_CooldownRemaining(t *testing.T) {
	cooldown := 5 * time.Minute

	tests := []struct {
		name              string
		lastForceKillTime time.Time
		wantZero          bool
	}{
		{
			name:              "no force-kill history",
			lastForceKillTime: time.Time{},
			wantZero:          true,
		},
		{
			name:              "recently killed",
			lastForceKillTime: time.Now().Add(-1 * time.Minute),
			wantZero:          false,
		},
		{
			name:              "cooldown expired",
			lastForceKillTime: time.Now().Add(-10 * time.Minute),
			wantZero:          true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			agent := &AgentHealthState{
				LastForceKillTime: tt.lastForceKillTime,
			}
			got := agent.CooldownRemaining(cooldown)
			if tt.wantZero && got != 0 {
				t.Errorf("CooldownRemaining() = %v, want 0", got)
			}
			if !tt.wantZero && got == 0 {
				t.Error("CooldownRemaining() = 0, want non-zero")
			}
		})
	}
}

// TestAgentHealthState_ShouldForceKill pins the inclusive (>=) threshold
// semantics: at-threshold already recommends a kill.
func TestAgentHealthState_ShouldForceKill(t *testing.T) {
	tests := []struct {
		name      string
		failures  int
		threshold int
		want      bool
	}{
		{
			name:      "below threshold",
			failures:  2,
			threshold: 3,
			want:      false,
		},
		{
			name:      "at threshold",
			failures:  3,
			threshold: 3,
			want:      true,
		},
		{
			name:      "above threshold",
			failures:  5,
			threshold: 3,
			want:      true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			agent := &AgentHealthState{
				ConsecutiveFailures: tt.failures,
			}
			if got := agent.ShouldForceKill(tt.threshold); got != tt.want {
				t.Errorf("ShouldForceKill() = %v, want %v", got, tt.want)
			}
		})
	}
}

// TestSaveHealthCheckState_CreatesDirectory verifies Save mkdirs the deacon
// subdirectory under a town root that does not exist yet.
func TestSaveHealthCheckState_CreatesDirectory(t *testing.T) {
	tmpDir := t.TempDir()
	nestedDir := filepath.Join(tmpDir, "nonexistent", "deacon")

	state := &HealthCheckState{
		Agents: make(map[string]*AgentHealthState),
	}

	// Should create the directory structure
	if err := SaveHealthCheckState(filepath.Join(tmpDir, "nonexistent"), state); err != nil {
		t.Fatalf("SaveHealthCheckState() error = %v", err)
	}

	// Verify directory was created
	if _, err := os.Stat(nestedDir); os.IsNotExist(err) {
		t.Error("Directory should have been created")
	}
}