Add Deacon stuck-session detection and force-kill protocol
Implements gt-l6ro3.4: Deacon can now detect and kill genuinely stuck/hung Claude Code sessions during health rounds. New commands: - gt deacon health-check <agent>: Pings agent, waits for response, tracks consecutive failures. Returns exit code 2 when force-kill threshold reached. - gt deacon force-kill <agent>: Kills tmux session, updates agent bead state, notifies mayor. Respects cooldown period. - gt deacon health-state: Shows health check state for all monitored agents. Detection protocol: 1. Send HEALTH_CHECK nudge to agent 2. Wait for agent bead update (configurable timeout, default 30s) 3. Track consecutive failures (default threshold: 3) 4. Recommend force-kill when threshold reached Force-kill protocol: 1. Log intervention (mail to agent) 2. Kill tmux session 3. Update agent bead state to "killed" 4. Notify mayor (optional) Configurable parameters: - --timeout: How long to wait for response (default 30s) - --failures: Consecutive failures before force-kill (default 3) - --cooldown: Minimum time between force-kills (default 5m) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
committed by
Steve Yegge
parent
108afdbc52
commit
3dd18a1981
197
internal/deacon/stuck.go
Normal file
197
internal/deacon/stuck.go
Normal file
@@ -0,0 +1,197 @@
|
||||
// Package deacon provides the Deacon agent infrastructure.
|
||||
package deacon
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Default parameters for stuck-session detection. Each value can be
// overridden per run via StuckConfig.
const (
	DefaultPingTimeout         = 30 * time.Second // How long to wait for an agent to answer a ping
	DefaultConsecutiveFailures = 3                // Failed checks in a row before force-kill is recommended
	DefaultCooldown            = 5 * time.Minute  // Minimum time between force-kills of the same agent
)
|
||||
|
||||
// StuckConfig holds configurable parameters for stuck-session detection.
// Start from DefaultStuckConfig and override fields as needed.
type StuckConfig struct {
	// PingTimeout is how long to wait for an agent to respond to a
	// health check before counting the check as a failure.
	PingTimeout time.Duration `json:"ping_timeout"`

	// ConsecutiveFailures is the number of failed checks in a row
	// before a force-kill is recommended.
	ConsecutiveFailures int `json:"consecutive_failures"`

	// Cooldown is the minimum time between force-kills of one agent.
	Cooldown time.Duration `json:"cooldown"`
}
|
||||
|
||||
// DefaultStuckConfig returns the default stuck detection config.
|
||||
func DefaultStuckConfig() *StuckConfig {
|
||||
return &StuckConfig{
|
||||
PingTimeout: DefaultPingTimeout,
|
||||
ConsecutiveFailures: DefaultConsecutiveFailures,
|
||||
Cooldown: DefaultCooldown,
|
||||
}
|
||||
}
|
||||
|
||||
// AgentHealthState tracks the health check state for a single agent.
// All timestamps are recorded in UTC, and the struct round-trips to
// JSON as part of HealthCheckState.
type AgentHealthState struct {
	// AgentID is the identifier (e.g., "gastown/polecats/max" or "deacon")
	AgentID string `json:"agent_id"`

	// LastPingTime is when we last sent a HEALTH_CHECK nudge
	LastPingTime time.Time `json:"last_ping_time,omitempty"`

	// LastResponseTime is when the agent last updated their activity
	LastResponseTime time.Time `json:"last_response_time,omitempty"`

	// ConsecutiveFailures counts how many health checks failed in a row.
	// It is reset to zero by RecordResponse and RecordForceKill.
	ConsecutiveFailures int `json:"consecutive_failures"`

	// LastForceKillTime is when we last force-killed this agent
	LastForceKillTime time.Time `json:"last_force_kill_time,omitempty"`

	// ForceKillCount is total number of force-kills for this agent
	ForceKillCount int `json:"force_kill_count"`
}
|
||||
|
||||
// HealthCheckState holds health check state for all monitored agents.
// It is persisted to disk via SaveHealthCheckState and read back with
// LoadHealthCheckState.
type HealthCheckState struct {
	// Agents maps agent ID to their health state
	Agents map[string]*AgentHealthState `json:"agents"`

	// LastUpdated is when this state was last written; it is set
	// automatically by SaveHealthCheckState.
	LastUpdated time.Time `json:"last_updated"`
}
|
||||
|
||||
// HealthCheckStateFile returns the path to the health check state file,
// which lives in the deacon subdirectory of the town root.
func HealthCheckStateFile(townRoot string) string {
	deaconDir := filepath.Join(townRoot, "deacon")
	return filepath.Join(deaconDir, "health-check-state.json")
}
|
||||
|
||||
// LoadHealthCheckState loads the health check state from disk.
|
||||
// Returns empty state if file doesn't exist.
|
||||
func LoadHealthCheckState(townRoot string) (*HealthCheckState, error) {
|
||||
stateFile := HealthCheckStateFile(townRoot)
|
||||
|
||||
data, err := os.ReadFile(stateFile)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
// Return empty state
|
||||
return &HealthCheckState{
|
||||
Agents: make(map[string]*AgentHealthState),
|
||||
}, nil
|
||||
}
|
||||
return nil, fmt.Errorf("reading health check state: %w", err)
|
||||
}
|
||||
|
||||
var state HealthCheckState
|
||||
if err := json.Unmarshal(data, &state); err != nil {
|
||||
return nil, fmt.Errorf("parsing health check state: %w", err)
|
||||
}
|
||||
|
||||
if state.Agents == nil {
|
||||
state.Agents = make(map[string]*AgentHealthState)
|
||||
}
|
||||
|
||||
return &state, nil
|
||||
}
|
||||
|
||||
// SaveHealthCheckState saves the health check state to disk.
|
||||
func SaveHealthCheckState(townRoot string, state *HealthCheckState) error {
|
||||
stateFile := HealthCheckStateFile(townRoot)
|
||||
|
||||
// Ensure directory exists
|
||||
if err := os.MkdirAll(filepath.Dir(stateFile), 0755); err != nil {
|
||||
return fmt.Errorf("creating deacon directory: %w", err)
|
||||
}
|
||||
|
||||
state.LastUpdated = time.Now().UTC()
|
||||
|
||||
data, err := json.MarshalIndent(state, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshaling health check state: %w", err)
|
||||
}
|
||||
|
||||
return os.WriteFile(stateFile, data, 0644)
|
||||
}
|
||||
|
||||
// GetAgentState returns the health state for an agent, creating if needed.
|
||||
func (s *HealthCheckState) GetAgentState(agentID string) *AgentHealthState {
|
||||
if s.Agents == nil {
|
||||
s.Agents = make(map[string]*AgentHealthState)
|
||||
}
|
||||
|
||||
state, ok := s.Agents[agentID]
|
||||
if !ok {
|
||||
state = &AgentHealthState{AgentID: agentID}
|
||||
s.Agents[agentID] = state
|
||||
}
|
||||
return state
|
||||
}
|
||||
|
||||
// HealthCheckResult represents the outcome of a single health check;
// the json tags make it suitable for machine-readable output.
type HealthCheckResult struct {
	// AgentID identifies the checked agent.
	AgentID string `json:"agent_id"`

	// Responded reports whether the agent answered the check.
	Responded bool `json:"responded"`

	// ResponseTime is how long the agent took to respond (omitted when
	// zero, e.g. when it did not respond).
	ResponseTime time.Duration `json:"response_time,omitempty"`

	// ConsecutiveFailures is the failure streak after this check.
	ConsecutiveFailures int `json:"consecutive_failures"`

	// ShouldForceKill reports whether the failure threshold was reached.
	ShouldForceKill bool `json:"should_force_kill"`

	// InCooldown reports whether a recent force-kill blocks another one.
	InCooldown bool `json:"in_cooldown"`

	// CooldownRemaining is how long until the cooldown expires.
	CooldownRemaining time.Duration `json:"cooldown_remaining,omitempty"`
}
|
||||
|
||||
// Common errors for stuck-session detection. Match them with errors.Is
// rather than comparing message strings.
var (
	// ErrAgentInCooldown indicates a force-kill was refused because the
	// agent was killed too recently.
	ErrAgentInCooldown = errors.New("agent is in cooldown period after recent force-kill")

	// ErrAgentNotFound indicates the agent or its session could not be located.
	ErrAgentNotFound = errors.New("agent not found or session doesn't exist")

	// ErrAgentResponsive indicates the agent answered the health check,
	// so no intervention is needed.
	ErrAgentResponsive = errors.New("agent is responsive, no action needed")
)
|
||||
|
||||
// RecordPing records that a health check ping was sent to an agent.
// The timestamp is stored in UTC, like the other persisted time fields.
func (s *AgentHealthState) RecordPing() {
	s.LastPingTime = time.Now().UTC()
}
|
||||
|
||||
// RecordResponse records that an agent responded to a health check.
|
||||
// This resets the consecutive failure counter.
|
||||
func (s *AgentHealthState) RecordResponse() {
|
||||
s.LastResponseTime = time.Now().UTC()
|
||||
s.ConsecutiveFailures = 0
|
||||
}
|
||||
|
||||
// RecordFailure records that an agent failed to respond to a health check.
// The streak keeps growing until RecordResponse or RecordForceKill
// resets it; ShouldForceKill compares it against the threshold.
func (s *AgentHealthState) RecordFailure() {
	s.ConsecutiveFailures++
}
|
||||
|
||||
// RecordForceKill records that an agent was force-killed.
|
||||
func (s *AgentHealthState) RecordForceKill() {
|
||||
s.LastForceKillTime = time.Now().UTC()
|
||||
s.ForceKillCount++
|
||||
s.ConsecutiveFailures = 0 // Reset after kill
|
||||
}
|
||||
|
||||
// IsInCooldown returns true if the agent was recently force-killed.
|
||||
func (s *AgentHealthState) IsInCooldown(cooldown time.Duration) bool {
|
||||
if s.LastForceKillTime.IsZero() {
|
||||
return false
|
||||
}
|
||||
return time.Since(s.LastForceKillTime) < cooldown
|
||||
}
|
||||
|
||||
// CooldownRemaining returns how long until cooldown expires.
|
||||
func (s *AgentHealthState) CooldownRemaining(cooldown time.Duration) time.Duration {
|
||||
if s.LastForceKillTime.IsZero() {
|
||||
return 0
|
||||
}
|
||||
remaining := cooldown - time.Since(s.LastForceKillTime)
|
||||
if remaining < 0 {
|
||||
return 0
|
||||
}
|
||||
return remaining
|
||||
}
|
||||
|
||||
// ShouldForceKill reports whether the consecutive-failure count has
// reached the failure threshold. Note the comparison is >=, so hitting
// the threshold exactly already recommends a kill.
func (s *AgentHealthState) ShouldForceKill(threshold int) bool {
	return s.ConsecutiveFailures >= threshold
}
|
||||
306
internal/deacon/stuck_test.go
Normal file
306
internal/deacon/stuck_test.go
Normal file
@@ -0,0 +1,306 @@
|
||||
package deacon
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestDefaultStuckConfig(t *testing.T) {
|
||||
config := DefaultStuckConfig()
|
||||
|
||||
if config.PingTimeout != DefaultPingTimeout {
|
||||
t.Errorf("PingTimeout = %v, want %v", config.PingTimeout, DefaultPingTimeout)
|
||||
}
|
||||
if config.ConsecutiveFailures != DefaultConsecutiveFailures {
|
||||
t.Errorf("ConsecutiveFailures = %v, want %v", config.ConsecutiveFailures, DefaultConsecutiveFailures)
|
||||
}
|
||||
if config.Cooldown != DefaultCooldown {
|
||||
t.Errorf("Cooldown = %v, want %v", config.Cooldown, DefaultCooldown)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHealthCheckStateFile(t *testing.T) {
|
||||
path := HealthCheckStateFile("/tmp/test-town")
|
||||
expected := "/tmp/test-town/deacon/health-check-state.json"
|
||||
if path != expected {
|
||||
t.Errorf("HealthCheckStateFile = %q, want %q", path, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadHealthCheckState_NonExistent(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
|
||||
state, err := LoadHealthCheckState(tmpDir)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadHealthCheckState() error = %v", err)
|
||||
}
|
||||
if state.Agents == nil {
|
||||
t.Error("Agents map should be initialized")
|
||||
}
|
||||
if len(state.Agents) != 0 {
|
||||
t.Errorf("Expected empty agents map, got %d entries", len(state.Agents))
|
||||
}
|
||||
}
|
||||
|
||||
func TestSaveAndLoadHealthCheckState(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
|
||||
// Create state with some data
|
||||
state := &HealthCheckState{
|
||||
Agents: map[string]*AgentHealthState{
|
||||
"gastown/polecats/max": {
|
||||
AgentID: "gastown/polecats/max",
|
||||
ConsecutiveFailures: 2,
|
||||
ForceKillCount: 1,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
// Save
|
||||
if err := SaveHealthCheckState(tmpDir, state); err != nil {
|
||||
t.Fatalf("SaveHealthCheckState() error = %v", err)
|
||||
}
|
||||
|
||||
// Verify file exists
|
||||
stateFile := HealthCheckStateFile(tmpDir)
|
||||
if _, err := os.Stat(stateFile); os.IsNotExist(err) {
|
||||
t.Fatal("State file was not created")
|
||||
}
|
||||
|
||||
// Load
|
||||
loaded, err := LoadHealthCheckState(tmpDir)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadHealthCheckState() error = %v", err)
|
||||
}
|
||||
|
||||
// Verify loaded data
|
||||
agent := loaded.Agents["gastown/polecats/max"]
|
||||
if agent == nil {
|
||||
t.Fatal("Agent not found in loaded state")
|
||||
}
|
||||
if agent.ConsecutiveFailures != 2 {
|
||||
t.Errorf("ConsecutiveFailures = %d, want 2", agent.ConsecutiveFailures)
|
||||
}
|
||||
if agent.ForceKillCount != 1 {
|
||||
t.Errorf("ForceKillCount = %d, want 1", agent.ForceKillCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetAgentState(t *testing.T) {
|
||||
state := &HealthCheckState{}
|
||||
|
||||
// First call creates the agent
|
||||
agent1 := state.GetAgentState("test/agent")
|
||||
if agent1 == nil {
|
||||
t.Fatal("GetAgentState returned nil")
|
||||
}
|
||||
if agent1.AgentID != "test/agent" {
|
||||
t.Errorf("AgentID = %q, want %q", agent1.AgentID, "test/agent")
|
||||
}
|
||||
|
||||
// Second call returns same agent
|
||||
agent2 := state.GetAgentState("test/agent")
|
||||
if agent1 != agent2 {
|
||||
t.Error("GetAgentState should return the same pointer")
|
||||
}
|
||||
}
|
||||
|
||||
func TestAgentHealthState_RecordPing(t *testing.T) {
|
||||
agent := &AgentHealthState{}
|
||||
|
||||
before := time.Now()
|
||||
agent.RecordPing()
|
||||
after := time.Now()
|
||||
|
||||
if agent.LastPingTime.Before(before) || agent.LastPingTime.After(after) {
|
||||
t.Error("LastPingTime should be set to current time")
|
||||
}
|
||||
}
|
||||
|
||||
func TestAgentHealthState_RecordResponse(t *testing.T) {
|
||||
agent := &AgentHealthState{
|
||||
ConsecutiveFailures: 5,
|
||||
}
|
||||
|
||||
before := time.Now()
|
||||
agent.RecordResponse()
|
||||
after := time.Now()
|
||||
|
||||
if agent.LastResponseTime.Before(before) || agent.LastResponseTime.After(after) {
|
||||
t.Error("LastResponseTime should be set to current time")
|
||||
}
|
||||
if agent.ConsecutiveFailures != 0 {
|
||||
t.Errorf("ConsecutiveFailures should be reset to 0, got %d", agent.ConsecutiveFailures)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAgentHealthState_RecordFailure(t *testing.T) {
|
||||
agent := &AgentHealthState{
|
||||
ConsecutiveFailures: 2,
|
||||
}
|
||||
|
||||
agent.RecordFailure()
|
||||
|
||||
if agent.ConsecutiveFailures != 3 {
|
||||
t.Errorf("ConsecutiveFailures = %d, want 3", agent.ConsecutiveFailures)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAgentHealthState_RecordForceKill(t *testing.T) {
|
||||
agent := &AgentHealthState{
|
||||
ConsecutiveFailures: 5,
|
||||
ForceKillCount: 2,
|
||||
}
|
||||
|
||||
before := time.Now()
|
||||
agent.RecordForceKill()
|
||||
after := time.Now()
|
||||
|
||||
if agent.LastForceKillTime.Before(before) || agent.LastForceKillTime.After(after) {
|
||||
t.Error("LastForceKillTime should be set to current time")
|
||||
}
|
||||
if agent.ForceKillCount != 3 {
|
||||
t.Errorf("ForceKillCount = %d, want 3", agent.ForceKillCount)
|
||||
}
|
||||
if agent.ConsecutiveFailures != 0 {
|
||||
t.Errorf("ConsecutiveFailures should be reset to 0, got %d", agent.ConsecutiveFailures)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAgentHealthState_IsInCooldown(t *testing.T) {
|
||||
cooldown := 5 * time.Minute
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
lastForceKillTime time.Time
|
||||
want bool
|
||||
}{
|
||||
{
|
||||
name: "no force-kill history",
|
||||
lastForceKillTime: time.Time{},
|
||||
want: false,
|
||||
},
|
||||
{
|
||||
name: "recently killed",
|
||||
lastForceKillTime: time.Now().Add(-1 * time.Minute),
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "cooldown expired",
|
||||
lastForceKillTime: time.Now().Add(-10 * time.Minute),
|
||||
want: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
agent := &AgentHealthState{
|
||||
LastForceKillTime: tt.lastForceKillTime,
|
||||
}
|
||||
if got := agent.IsInCooldown(cooldown); got != tt.want {
|
||||
t.Errorf("IsInCooldown() = %v, want %v", got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestAgentHealthState_CooldownRemaining(t *testing.T) {
|
||||
cooldown := 5 * time.Minute
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
lastForceKillTime time.Time
|
||||
wantZero bool
|
||||
}{
|
||||
{
|
||||
name: "no force-kill history",
|
||||
lastForceKillTime: time.Time{},
|
||||
wantZero: true,
|
||||
},
|
||||
{
|
||||
name: "recently killed",
|
||||
lastForceKillTime: time.Now().Add(-1 * time.Minute),
|
||||
wantZero: false,
|
||||
},
|
||||
{
|
||||
name: "cooldown expired",
|
||||
lastForceKillTime: time.Now().Add(-10 * time.Minute),
|
||||
wantZero: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
agent := &AgentHealthState{
|
||||
LastForceKillTime: tt.lastForceKillTime,
|
||||
}
|
||||
got := agent.CooldownRemaining(cooldown)
|
||||
if tt.wantZero && got != 0 {
|
||||
t.Errorf("CooldownRemaining() = %v, want 0", got)
|
||||
}
|
||||
if !tt.wantZero && got == 0 {
|
||||
t.Error("CooldownRemaining() = 0, want non-zero")
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestAgentHealthState_ShouldForceKill(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
failures int
|
||||
threshold int
|
||||
want bool
|
||||
}{
|
||||
{
|
||||
name: "below threshold",
|
||||
failures: 2,
|
||||
threshold: 3,
|
||||
want: false,
|
||||
},
|
||||
{
|
||||
name: "at threshold",
|
||||
failures: 3,
|
||||
threshold: 3,
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "above threshold",
|
||||
failures: 5,
|
||||
threshold: 3,
|
||||
want: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
agent := &AgentHealthState{
|
||||
ConsecutiveFailures: tt.failures,
|
||||
}
|
||||
if got := agent.ShouldForceKill(tt.threshold); got != tt.want {
|
||||
t.Errorf("ShouldForceKill() = %v, want %v", got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestSaveHealthCheckState_CreatesDirectory(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
nestedDir := filepath.Join(tmpDir, "nonexistent", "deacon")
|
||||
|
||||
state := &HealthCheckState{
|
||||
Agents: make(map[string]*AgentHealthState),
|
||||
}
|
||||
|
||||
// Should create the directory structure
|
||||
if err := SaveHealthCheckState(filepath.Join(tmpDir, "nonexistent"), state); err != nil {
|
||||
t.Fatalf("SaveHealthCheckState() error = %v", err)
|
||||
}
|
||||
|
||||
// Verify directory was created
|
||||
if _, err := os.Stat(nestedDir); os.IsNotExist(err) {
|
||||
t.Error("Directory should have been created")
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user