// gastown/internal/deacon/stuck.go

// Package deacon provides the Deacon agent infrastructure.
package deacon

import (
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"time"

	"github.com/steveyegge/gastown/internal/beads"
)

// Fallback parameters for stuck-session detection.
// They are used when no role bead config exists.
// Per ZFC: "Let agents decide thresholds. 'Stuck' is a judgment call."
const (
	// DefaultPingTimeout is how long to wait for a health check response.
	DefaultPingTimeout = 30 * time.Second

	// DefaultConsecutiveFailures is the number of failures before force-kill.
	DefaultConsecutiveFailures = 3

	// DefaultCooldown is the minimum time between force-kills.
	DefaultCooldown = 5 * time.Minute
)

// StuckConfig holds configurable parameters for stuck-session detection.
//
// Fields:
//   - PingTimeout: how long to wait for a health check response.
//   - ConsecutiveFailures: failed health checks in a row before force-kill.
//   - Cooldown: minimum time between force-kills.
//
// NOTE(review): the time.Duration fields are plain integers to encoding/json,
// so they presumably serialize as nanosecond counts rather than strings such
// as "30s" — confirm against whatever reads or writes this JSON.
type StuckConfig struct {
	PingTimeout         time.Duration `json:"ping_timeout"`
	ConsecutiveFailures int           `json:"consecutive_failures"`
	Cooldown            time.Duration `json:"cooldown"`
}

// DefaultStuckConfig returns a freshly allocated StuckConfig populated with
// the package defaults (DefaultPingTimeout, DefaultConsecutiveFailures and
// DefaultCooldown). Each call returns a new value the caller may modify.
func DefaultStuckConfig() *StuckConfig {
	var cfg StuckConfig
	cfg.PingTimeout = DefaultPingTimeout
	cfg.ConsecutiveFailures = DefaultConsecutiveFailures
	cfg.Cooldown = DefaultCooldown
	return &cfg
}

// LoadStuckConfig builds the stuck detection config for the town at townRoot.
//
// It starts from DefaultStuckConfig and overlays values from the Deacon's role
// bead. The defaults are returned unchanged when the role config cannot be
// loaded or is nil. Individual fields keep their default when the role bead
// leaves them unset: an empty or unparseable duration string, or a
// non-positive failure count, is ignored.
// Per ZFC: agents control their own thresholds via their role beads.
func LoadStuckConfig(townRoot string) *StuckConfig {
	cfg := DefaultStuckConfig()

	// Look up the Deacon's town-level role bead.
	store := beads.NewWithBeadsDir(townRoot, beads.ResolveBeadsDir(townRoot))
	role, err := store.GetRoleConfig(beads.RoleBeadIDTown("deacon"))
	if err != nil || role == nil {
		return cfg
	}

	// durationOr parses raw, yielding fallback when raw is empty or malformed.
	durationOr := func(raw string, fallback time.Duration) time.Duration {
		if raw == "" {
			return fallback
		}
		parsed, parseErr := time.ParseDuration(raw)
		if parseErr != nil {
			return fallback
		}
		return parsed
	}

	cfg.PingTimeout = durationOr(role.PingTimeout, cfg.PingTimeout)
	if n := role.ConsecutiveFailures; n > 0 {
		cfg.ConsecutiveFailures = n
	}
	cfg.Cooldown = durationOr(role.KillCooldown, cfg.Cooldown)

	return cfg
}

// AgentHealthState tracks the health check state for a single agent.
// It is the per-agent record held in HealthCheckState.Agents and is
// serialized to JSON using the field tags below.
//
// NOTE(review): the time.Time fields carry `omitempty`, but encoding/json does
// not treat a zero time.Time as empty, so they are presumably still written
// when unset — confirm whether readers of the state file care.
type AgentHealthState struct {
	// AgentID is the identifier (e.g., "gastown/polecats/max" or "deacon")
	AgentID string `json:"agent_id"`

	// LastPingTime is when we last sent a HEALTH_CHECK nudge.
	// Set by RecordPing; zero until then.
	LastPingTime time.Time `json:"last_ping_time,omitempty"`

	// LastResponseTime is when the agent last updated their activity.
	// Set by RecordResponse; zero until then.
	LastResponseTime time.Time `json:"last_response_time,omitempty"`

	// ConsecutiveFailures counts how many health checks failed in a row.
	// Incremented by RecordFailure; reset to zero by RecordResponse and
	// RecordForceKill.
	ConsecutiveFailures int `json:"consecutive_failures"`

	// LastForceKillTime is when we last force-killed this agent.
	// Set by RecordForceKill; the cooldown helpers measure from this time.
	LastForceKillTime time.Time `json:"last_force_kill_time,omitempty"`

	// ForceKillCount is total number of force-kills for this agent.
	// Incremented by RecordForceKill and never reset in this file.
	ForceKillCount int `json:"force_kill_count"`
}

// HealthCheckState holds health check state for all monitored agents.
// It is the top-level document read by LoadHealthCheckState and written by
// SaveHealthCheckState.
type HealthCheckState struct {
	// Agents maps agent ID to their health state.
	// LoadHealthCheckState and GetAgentState both replace a nil map with an
	// empty one, so callers going through them need not nil-check it.
	Agents map[string]*AgentHealthState `json:"agents"`

	// LastUpdated is when this state was last written.
	// SaveHealthCheckState overwrites it with the current UTC time on each save.
	LastUpdated time.Time `json:"last_updated"`
}

// HealthCheckStateFile returns the location of the health check state file
// for the town rooted at townRoot: <townRoot>/deacon/health-check-state.json.
// It only builds the path; nothing is created or checked on disk.
func HealthCheckStateFile(townRoot string) string {
	const (
		subdir   = "deacon"
		fileName = "health-check-state.json"
	)
	return filepath.Join(townRoot, subdir, fileName)
}

// LoadHealthCheckState loads the health check state for the town rooted at
// townRoot from the file named by HealthCheckStateFile.
//
// A missing file is not an error: an empty state is returned instead. Any
// other read failure, or a file that is not valid JSON, is returned as a
// wrapped error with a nil state.
//
// The returned state always has a non-nil Agents map containing no nil
// entries, so callers can dereference every value they find in it.
func LoadHealthCheckState(townRoot string) (*HealthCheckState, error) {
	stateFile := HealthCheckStateFile(townRoot)

	data, err := os.ReadFile(stateFile) //nolint:gosec // G304: path is constructed from trusted townRoot
	if err != nil {
		// errors.Is also matches wrapped not-exist errors, unlike os.IsNotExist.
		if errors.Is(err, os.ErrNotExist) {
			// Return empty state
			return &HealthCheckState{
				Agents: make(map[string]*AgentHealthState),
			}, nil
		}
		return nil, fmt.Errorf("reading health check state: %w", err)
	}

	var state HealthCheckState
	if err := json.Unmarshal(data, &state); err != nil {
		return nil, fmt.Errorf("parsing health check state: %w", err)
	}

	if state.Agents == nil {
		state.Agents = make(map[string]*AgentHealthState)
	}

	// A JSON null for an agent decodes to a nil pointer. Drop such entries so
	// they are treated as "no state yet" instead of causing a nil dereference.
	for id, agent := range state.Agents {
		if agent == nil {
			delete(state.Agents, id)
		}
	}

	return &state, nil
}

// SaveHealthCheckState writes state to the town's health check state file
// (see HealthCheckStateFile), creating the containing directory if needed.
//
// state.LastUpdated is set to the current UTC time before encoding, so the
// caller's struct is modified. The JSON is first written to a temporary file
// in the same directory and then renamed over the state file; where rename is
// atomic, a concurrent reader or a crash mid-write therefore sees either the
// previous contents or the new ones, never a truncated file.
//
// Returns an error if state is nil or if any filesystem or encoding step
// fails.
func SaveHealthCheckState(townRoot string, state *HealthCheckState) error {
	if state == nil {
		return errors.New("saving health check state: state is nil")
	}

	stateFile := HealthCheckStateFile(townRoot)
	dir := filepath.Dir(stateFile)

	// Ensure directory exists
	if err := os.MkdirAll(dir, 0755); err != nil {
		return fmt.Errorf("creating deacon directory: %w", err)
	}

	state.LastUpdated = time.Now().UTC()

	data, err := json.MarshalIndent(state, "", "  ")
	if err != nil {
		return fmt.Errorf("marshaling health check state: %w", err)
	}

	// os.CreateTemp creates the file with mode 0600, the same mode the state
	// file is meant to have.
	tmp, err := os.CreateTemp(dir, filepath.Base(stateFile)+".*.tmp")
	if err != nil {
		return fmt.Errorf("creating temporary health check state file: %w", err)
	}
	tmpName := tmp.Name()
	// Best-effort cleanup for the failure paths. After a successful rename the
	// temporary name no longer exists and the resulting error is ignored.
	defer func() { _ = os.Remove(tmpName) }()

	if _, err := tmp.Write(data); err != nil {
		_ = tmp.Close()
		return fmt.Errorf("writing health check state: %w", err)
	}
	if err := tmp.Close(); err != nil {
		return fmt.Errorf("writing health check state: %w", err)
	}
	if err := os.Rename(tmpName, stateFile); err != nil {
		return fmt.Errorf("replacing health check state file: %w", err)
	}
	return nil
}

// GetAgentState returns the health state for agentID, creating and storing a
// fresh record (with AgentID set) when none exists yet.
//
// The Agents map is allocated on first use. An entry that is present but nil
// is treated the same as a missing one and replaced, so the returned pointer
// is never nil.
func (s *HealthCheckState) GetAgentState(agentID string) *AgentHealthState {
	if s.Agents == nil {
		s.Agents = make(map[string]*AgentHealthState)
	}

	// Check the value rather than key presence so a nil entry is replaced too.
	state := s.Agents[agentID]
	if state == nil {
		state = &AgentHealthState{AgentID: agentID}
		s.Agents[agentID] = state
	}
	return state
}

// HealthCheckResult represents the outcome of a health check.
//
// Nothing in this file populates it; the notes below are inferred from the
// field names and the matching AgentHealthState fields and methods — confirm
// against the code that builds the result.
//
//   - AgentID: presumably the checked agent's identifier.
//   - Responded / ResponseTime: presumably whether the agent answered, and
//     how long it took.
//   - ConsecutiveFailures: presumably the agent's failure streak after the
//     check.
//   - ShouldForceKill: presumably the force-kill decision for this check.
//   - InCooldown / CooldownRemaining: presumably the cooldown status after a
//     recent force-kill.
//
// ResponseTime and CooldownRemaining are omitted from the JSON when zero.
type HealthCheckResult struct {
	AgentID             string        `json:"agent_id"`
	Responded           bool          `json:"responded"`
	ResponseTime        time.Duration `json:"response_time,omitempty"`
	ConsecutiveFailures int           `json:"consecutive_failures"`
	ShouldForceKill     bool          `json:"should_force_kill"`
	InCooldown          bool          `json:"in_cooldown"`
	CooldownRemaining   time.Duration `json:"cooldown_remaining,omitempty"`
}

// Sentinel errors for stuck-session detection. Compare with errors.Is.
var (
	// ErrAgentInCooldown indicates the agent is still inside its cooldown
	// period after a recent force-kill.
	ErrAgentInCooldown = errors.New("agent is in cooldown period after recent force-kill")

	// ErrAgentNotFound indicates the agent, or its session, does not exist.
	ErrAgentNotFound = errors.New("agent not found or session doesn't exist")

	// ErrAgentResponsive indicates the agent is responsive and needs no action.
	ErrAgentResponsive = errors.New("agent is responsive, no action needed")
)

// RecordPing stamps LastPingTime with the current time in UTC, marking that a
// health check ping was just sent to the agent.
func (s *AgentHealthState) RecordPing() {
	now := time.Now()
	s.LastPingTime = now.UTC()
}

// RecordResponse marks the agent as having answered a health check: the
// consecutive failure counter goes back to zero and LastResponseTime is set to
// the current time in UTC.
func (s *AgentHealthState) RecordResponse() {
	s.ConsecutiveFailures = 0
	s.LastResponseTime = time.Now().UTC()
}

// RecordFailure records that an agent failed to respond to a health check by
// incrementing ConsecutiveFailures. The streak is cleared again by
// RecordResponse or RecordForceKill.
func (s *AgentHealthState) RecordFailure() {
	s.ConsecutiveFailures++
}

// RecordForceKill notes that the agent has just been force-killed. It bumps
// ForceKillCount, clears the consecutive failure counter, and sets
// LastForceKillTime to the current time in UTC (the point the cooldown helpers
// measure from).
func (s *AgentHealthState) RecordForceKill() {
	killedAt := time.Now().UTC()

	s.ForceKillCount++
	s.ConsecutiveFailures = 0 // Start counting afresh after the kill
	s.LastForceKillTime = killedAt
}

// IsInCooldown reports whether the agent was force-killed less than cooldown
// ago. An agent that has never been force-killed (zero LastForceKillTime) is
// never in cooldown.
func (s *AgentHealthState) IsInCooldown(cooldown time.Duration) bool {
	last := s.LastForceKillTime
	return !last.IsZero() && time.Since(last) < cooldown
}

// CooldownRemaining returns how much of the cooldown period is still left,
// measured from LastForceKillTime. It returns 0 when the agent has never been
// force-killed or when the cooldown has already elapsed; the result is never
// negative.
func (s *AgentHealthState) CooldownRemaining(cooldown time.Duration) time.Duration {
	last := s.LastForceKillTime
	if last.IsZero() {
		return 0
	}
	if left := cooldown - time.Since(last); left > 0 {
		return left
	}
	return 0
}

// ShouldForceKill reports whether the agent's consecutive failure count has
// reached threshold.
//
// An agent with no recorded failures is never eligible, even when threshold is
// zero or negative (for example when it comes from a zero-value StuckConfig);
// otherwise such a threshold would mark every healthy agent for force-kill.
// For any positive threshold the result is ConsecutiveFailures >= threshold.
func (s *AgentHealthState) ShouldForceKill(threshold int) bool {
	if s.ConsecutiveFailures <= 0 {
		return false
	}
	return s.ConsecutiveFailures >= threshold
}