Files
gastown/internal/cmd/deacon.go
Ben Kraus 38adfa4d8b codex
2026-01-08 12:36:54 -05:00

1091 lines
34 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package cmd
import (
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"sort"
	"strings"
	"time"

	"github.com/spf13/cobra"

	"github.com/steveyegge/gastown/internal/beads"
	"github.com/steveyegge/gastown/internal/claude"
	"github.com/steveyegge/gastown/internal/config"
	"github.com/steveyegge/gastown/internal/constants"
	"github.com/steveyegge/gastown/internal/deacon"
	"github.com/steveyegge/gastown/internal/polecat"
	"github.com/steveyegge/gastown/internal/runtime"
	"github.com/steveyegge/gastown/internal/session"
	"github.com/steveyegge/gastown/internal/style"
	"github.com/steveyegge/gastown/internal/tmux"
	"github.com/steveyegge/gastown/internal/workspace"
)
// getDeaconSessionName returns the name of the Deacon's tmux session.
// It is a thin wrapper around session.DeaconSessionName; the start, stop,
// attach, status and restart handlers in this file obtain the name through it.
func getDeaconSessionName() string {
return session.DeaconSessionName()
}
// deaconCmd is the parent "deacon" command (alias "dea"), placed in the
// GroupAgents command group. It does no work of its own: RunE is
// requireSubcommand (defined elsewhere), and the subcommands are attached to
// it in init.
var deaconCmd = &cobra.Command{
Use: "deacon",
Aliases: []string{"dea"},
GroupID: GroupAgents,
Short: "Manage the Deacon session",
RunE: requireSubcommand,
Long: `Manage the Deacon tmux session.
The Deacon is the hierarchical health-check orchestrator for Gas Town.
It monitors the Mayor and Witnesses, handles lifecycle requests, and
keeps the town running. Use the subcommands to start, stop, attach,
and check status.`,
}
// deaconStartCmd is "deacon start" (alias "spawn"); it is handled by
// runDeaconStart. An --agent flag is attached to it in init.
var deaconStartCmd = &cobra.Command{
Use: "start",
Aliases: []string{"spawn"},
Short: "Start the Deacon session",
Long: `Start the Deacon tmux session.
Creates a new detached tmux session for the Deacon and launches Claude.
The session runs in the workspace root directory.`,
RunE: runDeaconStart,
}
// deaconStopCmd is "deacon stop"; it is handled by runDeaconStop, which sends
// an interrupt to the session and then kills it.
var deaconStopCmd = &cobra.Command{
Use: "stop",
Short: "Stop the Deacon session",
Long: `Stop the Deacon tmux session.
Attempts graceful shutdown first (Ctrl-C), then kills the tmux session.`,
RunE: runDeaconStop,
}
// deaconAttachCmd is "deacon attach" (alias "at"); it is handled by
// runDeaconAttach, which starts the session first when it is not running.
// An --agent flag is attached to it in init.
var deaconAttachCmd = &cobra.Command{
Use: "attach",
Aliases: []string{"at"},
Short: "Attach to the Deacon session",
Long: `Attach to the running Deacon tmux session.
Attaches the current terminal to the Deacon's tmux session.
Detach with Ctrl-B D.`,
RunE: runDeaconAttach,
}
// deaconStatusCmd is "deacon status"; it is handled by runDeaconStatus, which
// reports the pause state (when paused) and whether the tmux session exists.
var deaconStatusCmd = &cobra.Command{
Use: "status",
Short: "Check Deacon session status",
Long: `Check if the Deacon tmux session is currently running.`,
RunE: runDeaconStatus,
}
// deaconRestartCmd is "deacon restart"; it is handled by runDeaconRestart.
// An --agent flag is attached to it in init.
var deaconRestartCmd = &cobra.Command{
Use: "restart",
Short: "Restart the Deacon session",
Long: `Restart the Deacon tmux session.
Stops the current session (if running) and starts a fresh one.`,
RunE: runDeaconRestart,
}
// deaconAgentOverride holds the value of the --agent flag. init binds the
// same variable to the start, attach and restart subcommands, and it is
// passed to startDeaconSession as the agent override.
var deaconAgentOverride string
// deaconHeartbeatCmd is "deacon heartbeat [action]"; it is handled by
// runDeaconHeartbeat. No Args validator is set, so any number of positional
// words is accepted; the handler joins them with spaces into one action.
var deaconHeartbeatCmd = &cobra.Command{
Use: "heartbeat [action]",
Short: "Update the Deacon heartbeat",
Long: `Update the Deacon heartbeat file.
The heartbeat signals to the daemon that the Deacon is alive and working.
Call this at the start of each wake cycle to prevent daemon pokes.
Examples:
gt deacon heartbeat # Touch heartbeat with timestamp
gt deacon heartbeat "checking mayor" # Touch with action description`,
RunE: runDeaconHeartbeat,
}
// deaconTriggerPendingCmd is "deacon trigger-pending"; it is handled by
// runDeaconTriggerPending. Its --timeout flag (triggerTimeout) is declared in
// init.
var deaconTriggerPendingCmd = &cobra.Command{
Use: "trigger-pending",
Short: "Trigger pending polecat spawns (bootstrap mode)",
Long: `Check inbox for POLECAT_STARTED messages and trigger ready polecats.
⚠️ BOOTSTRAP MODE ONLY - Uses regex detection (ZFC violation acceptable).
This command uses WaitForRuntimeReady (regex) to detect when the runtime is ready.
This is appropriate for daemon bootstrap when no AI is available.
In steady-state, the Deacon should use AI-based observation instead:
gt deacon pending # View pending spawns with captured output
gt peek <session> # Observe session output (AI analyzes)
gt nudge <session> # Trigger when AI determines ready
This command is typically called by the daemon during cold startup.`,
RunE: runDeaconTriggerPending,
}
// deaconHealthCheckCmd is "deacon health-check <agent>"; it requires exactly
// one positional argument and is handled by runDeaconHealthCheck. Its
// --timeout, --failures and --cooldown flags are declared in init.
var deaconHealthCheckCmd = &cobra.Command{
Use: "health-check <agent>",
Short: "Send a health check ping to an agent and track response",
Long: `Send a HEALTH_CHECK nudge to an agent and wait for response.
This command is used by the Deacon during health rounds to detect stuck sessions.
It tracks consecutive failures and determines when force-kill is warranted.
The detection protocol:
1. Send HEALTH_CHECK nudge to the agent
2. Wait for agent to update their bead (configurable timeout, default 30s)
3. If no activity update, increment failure counter
4. After N consecutive failures (default 3), recommend force-kill
Exit codes:
0 - Agent responded or is in cooldown (no action needed)
1 - Error occurred
2 - Agent should be force-killed (consecutive failures exceeded)
Examples:
gt deacon health-check gastown/polecats/max
gt deacon health-check gastown/witness --timeout=60s
gt deacon health-check deacon --failures=5`,
Args: cobra.ExactArgs(1),
RunE: runDeaconHealthCheck,
}
// deaconForceKillCmd is "deacon force-kill <agent>"; it requires exactly one
// positional argument and is handled by runDeaconForceKill. Its --reason and
// --skip-notify flags are declared in init.
var deaconForceKillCmd = &cobra.Command{
Use: "force-kill <agent>",
Short: "Force-kill an unresponsive agent session",
Long: `Force-kill an agent session that has been detected as stuck.
This command is used by the Deacon when an agent fails consecutive health checks.
It performs the force-kill protocol:
1. Log the intervention (send mail to agent)
2. Kill the tmux session
3. Update agent bead state to "killed"
4. Notify mayor (optional, for visibility)
After force-kill, the agent is 'asleep'. Normal wake mechanisms apply:
- gt rig boot restarts it
- Or stays asleep until next activity trigger
This respects the cooldown period - won't kill if recently killed.
Examples:
gt deacon force-kill gastown/polecats/max
gt deacon force-kill gastown/witness --reason="unresponsive for 90s"`,
Args: cobra.ExactArgs(1),
RunE: runDeaconForceKill,
}
// deaconHealthStateCmd is "deacon health-state"; it is handled by
// runDeaconHealthState, which prints the recorded health-check state.
var deaconHealthStateCmd = &cobra.Command{
Use: "health-state",
Short: "Show health check state for all monitored agents",
Long: `Display the current health check state including:
- Consecutive failure counts
- Last ping and response times
- Force-kill history and cooldowns
This helps the Deacon understand which agents may need attention.`,
RunE: runDeaconHealthState,
}
// deaconStaleHooksCmd is "deacon stale-hooks"; it is handled by
// runDeaconStaleHooks. Its --max-age and --dry-run flags are declared in init.
var deaconStaleHooksCmd = &cobra.Command{
Use: "stale-hooks",
Short: "Find and unhook stale hooked beads",
Long: `Find beads stuck in 'hooked' status and unhook them if the agent is gone.
Beads can get stuck in 'hooked' status when agents die or abandon work.
This command finds hooked beads older than the threshold (default: 1 hour),
checks if the assignee agent is still alive, and unhooks them if not.
Examples:
gt deacon stale-hooks # Find and unhook stale beads
gt deacon stale-hooks --dry-run # Preview what would be unhooked
gt deacon stale-hooks --max-age=30m # Use 30 minute threshold`,
RunE: runDeaconStaleHooks,
}
// deaconPauseCmd is "deacon pause"; it is handled by runDeaconPause. Its
// --reason flag (pauseReason) is declared in init.
var deaconPauseCmd = &cobra.Command{
Use: "pause",
Short: "Pause the Deacon to prevent patrol actions",
Long: `Pause the Deacon to prevent it from performing any patrol actions.
When paused, the Deacon:
- Will not create patrol molecules
- Will not run health checks
- Will not take any autonomous actions
- Will display a PAUSED message on startup
The pause state persists across session restarts. Use 'gt deacon resume'
to allow the Deacon to work again.
Examples:
gt deacon pause # Pause with no reason
gt deacon pause --reason="testing" # Pause with a reason`,
RunE: runDeaconPause,
}
// deaconResumeCmd is "deacon resume"; it is handled by runDeaconResume.
var deaconResumeCmd = &cobra.Command{
Use: "resume",
Short: "Resume the Deacon to allow patrol actions",
Long: `Resume the Deacon so it can perform patrol actions again.
This removes the pause file and allows the Deacon to work normally.`,
RunE: runDeaconResume,
}
// Package-level storage for the subcommand flags. Each variable is bound to
// its flag (and given its default value) in init.
var (
// triggerTimeout backs "trigger-pending --timeout".
triggerTimeout time.Duration
// Health check flags
// healthCheckTimeout backs "health-check --timeout".
healthCheckTimeout time.Duration
// healthCheckFailures backs "health-check --failures".
healthCheckFailures int
// healthCheckCooldown backs "health-check --cooldown". It is also read by
// the force-kill and health-state handlers, which do not declare the flag
// themselves.
healthCheckCooldown time.Duration
// Force kill flags
// forceKillReason backs "force-kill --reason".
forceKillReason string
// forceKillSkipNotify backs "force-kill --skip-notify".
forceKillSkipNotify bool
// Stale hooks flags
// staleHooksMaxAge backs "stale-hooks --max-age".
staleHooksMaxAge time.Duration
// staleHooksDryRun backs "stale-hooks --dry-run".
staleHooksDryRun bool
// Pause flags
// pauseReason backs "pause --reason".
pauseReason string
)
// init wires up the Deacon command tree: it attaches every subcommand to
// deaconCmd, declares each subcommand's flags together with their defaults,
// and finally registers deaconCmd on rootCmd.
func init() {
	deaconCmd.AddCommand(
		deaconStartCmd,
		deaconStopCmd,
		deaconAttachCmd,
		deaconStatusCmd,
		deaconRestartCmd,
		deaconHeartbeatCmd,
		deaconTriggerPendingCmd,
		deaconHealthCheckCmd,
		deaconForceKillCmd,
		deaconHealthStateCmd,
		deaconStaleHooksCmd,
		deaconPauseCmd,
		deaconResumeCmd,
	)

	// trigger-pending
	triggerFlags := deaconTriggerPendingCmd.Flags()
	triggerFlags.DurationVar(&triggerTimeout, "timeout", 2*time.Second,
		"Timeout for checking if Claude is ready")

	// health-check
	healthFlags := deaconHealthCheckCmd.Flags()
	healthFlags.DurationVar(&healthCheckTimeout, "timeout", 30*time.Second,
		"How long to wait for agent response")
	healthFlags.IntVar(&healthCheckFailures, "failures", 3,
		"Number of consecutive failures before recommending force-kill")
	healthFlags.DurationVar(&healthCheckCooldown, "cooldown", 5*time.Minute,
		"Minimum time between force-kills of same agent")

	// force-kill
	killFlags := deaconForceKillCmd.Flags()
	killFlags.StringVar(&forceKillReason, "reason", "",
		"Reason for force-kill (included in notifications)")
	killFlags.BoolVar(&forceKillSkipNotify, "skip-notify", false,
		"Skip sending notification mail to mayor")

	// stale-hooks
	staleFlags := deaconStaleHooksCmd.Flags()
	staleFlags.DurationVar(&staleHooksMaxAge, "max-age", time.Hour,
		"Maximum age before a hooked bead is considered stale")
	staleFlags.BoolVar(&staleHooksDryRun, "dry-run", false,
		"Preview what would be unhooked without making changes")

	// pause
	deaconPauseCmd.Flags().StringVar(&pauseReason, "reason", "",
		"Reason for pausing the Deacon")

	// start, attach and restart share one --agent flag backed by the same variable.
	for _, sub := range []*cobra.Command{deaconStartCmd, deaconAttachCmd, deaconRestartCmd} {
		sub.Flags().StringVar(&deaconAgentOverride, "agent", "", "Agent alias to run the Deacon with (overrides town default)")
	}

	rootCmd.AddCommand(deaconCmd)
}
// runDeaconStart handles "gt deacon start". It refuses to start when a Deacon
// tmux session already exists; otherwise it creates one via
// startDeaconSession (honouring the --agent override) and prints a hint on
// how to attach.
func runDeaconStart(cmd *cobra.Command, args []string) error {
	tm := tmux.NewTmux()
	name := getDeaconSessionName()

	exists, err := tm.HasSession(name)
	switch {
	case err != nil:
		return fmt.Errorf("checking session: %w", err)
	case exists:
		return errors.New("Deacon session already running. Attach with: gt deacon attach")
	}

	if startErr := startDeaconSession(tm, name, deaconAgentOverride); startErr != nil {
		return startErr
	}

	check := style.Bold.Render("✓")
	hint := style.Dim.Render("gt deacon attach")
	fmt.Printf("%s Deacon session started. Attach with: %s\n", check, hint)
	return nil
}
// startDeaconSession creates and initializes the Deacon tmux session.
//
// It locates the workspace root from the current directory, makes sure the
// "deacon" subdirectory exists, creates the tmux session there, sends the
// agent startup command (built with agentOverride) into it, and then sends
// the startup and propulsion nudges.
//
// Errors are returned for: no workspace found, directory creation failure,
// session creation failure, startup-command build failure, and failure to
// send the command. If either of the last two happens the freshly created
// session is killed again (best effort) so a half-initialized session is not
// left behind to make the next start report "already running". Everything
// after the command has been sent is best effort and never fails the call.
func startDeaconSession(t *tmux.Tmux, sessionName, agentOverride string) error {
	// Find workspace root
	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}

	// Deacon runs from its own directory (for correct role detection by gt prime)
	deaconDir := filepath.Join(townRoot, "deacon")
	if err := os.MkdirAll(deaconDir, 0755); err != nil {
		return fmt.Errorf("creating deacon directory: %w", err)
	}

	// Ensure Claude settings exist (autonomous role needs mail in SessionStart).
	// A failure here is only reported as a warning.
	if err := claude.EnsureSettingsForRole(deaconDir, "deacon"); err != nil {
		style.PrintWarning("Could not create deacon settings: %v", err)
	}

	// Create session in deacon directory
	fmt.Println("Starting Deacon session...")
	if err := t.NewSession(sessionName, deaconDir); err != nil {
		return fmt.Errorf("creating session: %w", err)
	}

	// Set environment (non-fatal: session works without these)
	_ = t.SetEnvironment(sessionName, "GT_ROLE", "deacon")
	_ = t.SetEnvironment(sessionName, "BD_ACTOR", "deacon")

	// Apply Deacon theme (non-fatal: theming failure doesn't affect operation)
	// Note: ConfigureGasTownSession includes cycle bindings
	theme := tmux.DeaconTheme()
	_ = t.ConfigureGasTownSession(sessionName, theme, "", "Deacon", "health-check")

	// Launch Claude directly (no shell respawn loop)
	// Restarts are handled by daemon via ensureDeaconRunning on each heartbeat
	// The startup hook handles context loading automatically
	// Export GT_ROLE and BD_ACTOR in the command since tmux SetEnvironment only affects new panes
	startupCmd, err := config.BuildAgentStartupCommandWithAgentOverride("deacon", "deacon", "", "", agentOverride)
	if err != nil {
		_ = t.KillSession(sessionName) // best-effort cleanup of the empty session
		return fmt.Errorf("building startup command: %w", err)
	}
	if err := t.SendKeys(sessionName, startupCmd); err != nil {
		_ = t.KillSession(sessionName) // best-effort cleanup of the empty session
		return fmt.Errorf("sending command: %w", err)
	}

	// Wait for Claude to start (non-fatal: a timeout here is ignored)
	_ = t.WaitForCommand(sessionName, constants.SupportedShells, constants.ClaudeStartTimeout)
	time.Sleep(constants.ShutdownNotifyDelay)

	runtimeConfig := config.LoadRuntimeConfig("")
	_ = runtime.RunStartupFallback(t, sessionName, "deacon", runtimeConfig) // Non-fatal

	// Inject startup nudge for predecessor discovery via /resume
	_ = session.StartupNudge(t, sessionName, session.StartupNudgeConfig{
		Recipient: "deacon",
		Sender:    "daemon",
		Topic:     "patrol",
	}) // Non-fatal

	// GUPP: Gas Town Universal Propulsion Principle
	// Send the propulsion nudge to trigger autonomous patrol execution.
	// Wait for beacon to be fully processed (needs to be separate prompt)
	time.Sleep(2 * time.Second)
	_ = t.NudgeSession(sessionName, session.PropulsionNudgeForRole("deacon", deaconDir)) // Non-fatal
	return nil
}
// runDeaconStop handles "gt deacon stop". It fails when no Deacon session
// exists. Otherwise it sends Ctrl-C to the session as a best-effort graceful
// interrupt, waits 100ms, and then kills the tmux session.
func runDeaconStop(cmd *cobra.Command, args []string) error {
	tm := tmux.NewTmux()
	name := getDeaconSessionName()

	alive, err := tm.HasSession(name)
	switch {
	case err != nil:
		return fmt.Errorf("checking session: %w", err)
	case !alive:
		return errors.New("Deacon session is not running")
	}

	fmt.Println("Stopping Deacon session...")

	// Best-effort interrupt; its result does not affect the kill that follows.
	_ = tm.SendKeysRaw(name, "C-c")
	time.Sleep(100 * time.Millisecond)

	if killErr := tm.KillSession(name); killErr != nil {
		return fmt.Errorf("killing session: %w", killErr)
	}

	fmt.Printf("%s Deacon session stopped.\n", style.Bold.Render("✓"))
	return nil
}
// runDeaconAttach handles "gt deacon attach". When the Deacon session does
// not exist yet it is started first via startDeaconSession (honouring the
// --agent override); the terminal is then handed to attachToTmuxSession, the
// shared attach helper defined elsewhere.
func runDeaconAttach(cmd *cobra.Command, args []string) error {
	tm := tmux.NewTmux()
	name := getDeaconSessionName()

	alive, err := tm.HasSession(name)
	if err != nil {
		return fmt.Errorf("checking session: %w", err)
	}
	if alive {
		return attachToTmuxSession(name)
	}

	// Auto-start, then attach to the fresh session.
	fmt.Println("Deacon session not running, starting...")
	if startErr := startDeaconSession(tm, name, deaconAgentOverride); startErr != nil {
		return startErr
	}
	return attachToTmuxSession(name)
}
// runDeaconStatus handles "gt deacon status". It first prints a pause banner
// when a workspace is found and the Deacon is paused, then reports whether
// the tmux session exists, with attach state and creation time when the
// session info can be read.
func runDeaconStatus(cmd *cobra.Command, args []string) error {
	tm := tmux.NewTmux()
	name := getDeaconSessionName()

	// Pause state comes first. Both the workspace lookup error and the
	// IsPaused error are deliberately ignored: the banner is simply skipped.
	if townRoot, _ := workspace.FindFromCwdOrError(); townRoot != "" {
		if paused, state, pauseErr := deacon.IsPaused(townRoot); pauseErr == nil && paused {
			fmt.Printf("%s DEACON PAUSED\n", style.Bold.Render("⏸️"))
			if state.Reason != "" {
				fmt.Printf(" Reason: %s\n", state.Reason)
			}
			fmt.Printf(" Paused at: %s\n", state.PausedAt.Format(time.RFC3339))
			fmt.Printf(" Paused by: %s\n", state.PausedBy)
			fmt.Println()
			fmt.Printf("Resume with: %s\n", style.Dim.Render("gt deacon resume"))
			fmt.Println()
		}
	}

	alive, err := tm.HasSession(name)
	if err != nil {
		return fmt.Errorf("checking session: %w", err)
	}
	if !alive {
		fmt.Printf("%s Deacon session is %s\n",
			style.Dim.Render("○"),
			"not running")
		fmt.Printf("\nStart with: %s\n", style.Dim.Render("gt deacon start"))
		return nil
	}

	// The headline is the same whether or not the details are available.
	info, infoErr := tm.GetSessionInfo(name)
	fmt.Printf("%s Deacon session is %s\n",
		style.Bold.Render("●"),
		style.Bold.Render("running"))
	if infoErr != nil {
		return nil
	}

	attachState := "detached"
	if info.Attached {
		attachState = "attached"
	}
	fmt.Printf(" Status: %s\n", attachState)
	fmt.Printf(" Created: %s\n", info.Created)
	fmt.Printf("\nAttach with: %s\n", style.Dim.Render("gt deacon attach"))
	return nil
}
// runDeaconRestart handles "gt deacon restart". An existing session is
// killed (a kill failure is only a warning), after which runDeaconStart is
// invoked to bring up a fresh session.
func runDeaconRestart(cmd *cobra.Command, args []string) error {
	tm := tmux.NewTmux()
	name := getDeaconSessionName()

	alive, err := tm.HasSession(name)
	if err != nil {
		return fmt.Errorf("checking session: %w", err)
	}

	fmt.Println("Restarting Deacon...")
	if alive {
		if killErr := tm.KillSession(name); killErr != nil {
			style.PrintWarning("failed to kill session: %v", killErr)
		}
	}

	// Start fresh; runDeaconStart re-checks for an existing session itself.
	if startErr := runDeaconStart(cmd, args); startErr != nil {
		return startErr
	}

	fmt.Printf("%s Deacon restarted\n", style.Bold.Render("✓"))
	fmt.Printf(" %s\n", style.Dim.Render("Use 'gt deacon attach' to connect"))
	return nil
}
// runDeaconHeartbeat handles "gt deacon heartbeat [action]". It refuses to
// touch the heartbeat while the Deacon is paused (returning an error).
// Otherwise it records the heartbeat, attaching the space-joined positional
// arguments as the action description when any non-empty text was given.
func runDeaconHeartbeat(cmd *cobra.Command, args []string) error {
	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}

	// A paused Deacon must not refresh its heartbeat.
	paused, state, err := deacon.IsPaused(townRoot)
	if err != nil {
		return fmt.Errorf("checking pause state: %w", err)
	}
	if paused {
		fmt.Printf("%s Deacon is paused. Use 'gt deacon resume' to unpause.\n", style.Bold.Render("⏸️"))
		if state.Reason != "" {
			fmt.Printf(" Reason: %s\n", state.Reason)
		}
		return errors.New("Deacon is paused")
	}

	// strings.Join yields "" for no arguments, which selects the plain touch.
	action := strings.Join(args, " ")
	if action == "" {
		if touchErr := deacon.Touch(townRoot); touchErr != nil {
			return fmt.Errorf("updating heartbeat: %w", touchErr)
		}
		fmt.Printf("%s Heartbeat updated\n", style.Bold.Render("✓"))
		return nil
	}

	if touchErr := deacon.TouchWithAction(townRoot, action, 0, 0); touchErr != nil {
		return fmt.Errorf("updating heartbeat: %w", touchErr)
	}
	fmt.Printf("%s Heartbeat updated: %s\n", style.Bold.Render("✓"), action)
	return nil
}
// runDeaconTriggerPending handles "gt deacon trigger-pending". It checks the
// inbox for pending polecat spawns, tries to trigger each of them (using the
// --timeout value), prunes pending spawns older than five minutes, and prints
// a per-spawn report followed by a count of spawns still waiting.
//
// Inbox and trigger failures are returned as errors. A pruning failure is
// only reported as a warning, because the triggering work has already been
// done by then.
func runDeaconTriggerPending(cmd *cobra.Command, args []string) error {
	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}

	// Step 1: Check inbox for new POLECAT_STARTED messages
	pending, err := polecat.CheckInboxForSpawns(townRoot)
	if err != nil {
		return fmt.Errorf("checking inbox: %w", err)
	}
	if len(pending) == 0 {
		fmt.Printf("%s No pending spawns\n", style.Dim.Render("○"))
		return nil
	}
	fmt.Printf("%s Found %d pending spawn(s)\n", style.Bold.Render("●"), len(pending))

	// Step 2: Try to trigger each pending spawn
	results, err := polecat.TriggerPendingSpawns(townRoot, triggerTimeout)
	if err != nil {
		return fmt.Errorf("triggering: %w", err)
	}

	// Report results. Entries that were neither triggered nor failed are
	// not printed; they are covered by the "still waiting" summary below.
	triggered := 0
	for _, r := range results {
		switch {
		case r.Triggered:
			triggered++
			fmt.Printf(" %s Triggered %s/%s\n",
				style.Bold.Render("✓"),
				r.Spawn.Rig, r.Spawn.Polecat)
		case r.Error != nil:
			fmt.Printf(" %s %s/%s: %v\n",
				style.Dim.Render("⚠"),
				r.Spawn.Rig, r.Spawn.Polecat, r.Error)
		}
	}

	// Step 3: Prune stale pending spawns (older than 5 minutes)
	pruned, pruneErr := polecat.PruneStalePending(townRoot, 5*time.Minute)
	if pruneErr != nil {
		style.PrintWarning("failed to prune stale pending spawns: %v", pruneErr)
	}
	if pruned > 0 {
		fmt.Printf(" %s Pruned %d stale spawn(s)\n", style.Dim.Render("○"), pruned)
	}

	// Summary
	if remaining := len(pending) - triggered; remaining > 0 {
		fmt.Printf("%s %d spawn(s) still waiting for Claude\n",
			style.Dim.Render("○"), remaining)
	}
	return nil
}
// runDeaconHealthCheck implements the health-check command.
//
// It nudges the agent named by args[0] with a HEALTH_CHECK message and then
// watches the agent's bead: an update time later than the one read before
// the nudge counts as a response. The outcome is recorded in the persisted
// health check state.
//
// Outcomes:
//   - agent in cooldown, or its session not running: prints a note, returns nil
//   - agent responded: records the response, returns nil
//   - agent silent: records a failure; if the --failures threshold is reached
//     the process exits with code 2 via os.Exit, otherwise returns nil
//
// Errors are returned for a missing workspace, unreadable state, an invalid
// agent address, a failed session check, or a failed nudge. Failing to save
// the state is only a warning.
func runDeaconHealthCheck(cmd *cobra.Command, args []string) error {
	agent := args[0]
	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}

	// Load health check state
	state, err := deacon.LoadHealthCheckState(townRoot)
	if err != nil {
		return fmt.Errorf("loading health check state: %w", err)
	}
	agentState := state.GetAgentState(agent)

	// Check if agent is in cooldown
	if agentState.IsInCooldown(healthCheckCooldown) {
		remaining := agentState.CooldownRemaining(healthCheckCooldown)
		fmt.Printf("%s Agent %s is in cooldown (remaining: %s)\n",
			style.Dim.Render("○"), agent, remaining.Round(time.Second))
		return nil
	}

	// Resolve the agent's bead ID and tmux session name
	beadID, sessionName, err := agentAddressToIDs(agent)
	if err != nil {
		return fmt.Errorf("invalid agent address: %w", err)
	}

	t := tmux.NewTmux()
	exists, err := t.HasSession(sessionName)
	if err != nil {
		return fmt.Errorf("checking session: %w", err)
	}
	if !exists {
		fmt.Printf("%s Agent %s session not running\n", style.Dim.Render("○"), agent)
		return nil
	}

	// Baseline update time, read before the ping. If it cannot be read (the
	// bead might not exist yet) the zero time is used, so any later
	// successful read counts as a response.
	baselineTime, err := getAgentBeadUpdateTime(townRoot, beadID)
	if err != nil {
		baselineTime = time.Time{}
	}

	// Record ping
	agentState.RecordPing()

	// Send health check nudge
	if err := t.NudgeSession(sessionName, "HEALTH_CHECK: respond with any action to confirm responsiveness"); err != nil {
		return fmt.Errorf("sending nudge: %w", err)
	}
	fmt.Printf("%s Sent HEALTH_CHECK to %s, waiting %s...\n",
		style.Bold.Render("→"), agent, healthCheckTimeout)

	// Wait for response and record the result
	if waitForAgentBeadUpdate(townRoot, beadID, baselineTime, healthCheckTimeout) {
		agentState.RecordResponse()
		if err := deacon.SaveHealthCheckState(townRoot, state); err != nil {
			style.PrintWarning("failed to save health check state: %v", err)
		}
		fmt.Printf("%s Agent %s responded (failures reset to 0)\n",
			style.Bold.Render("✓"), agent)
		return nil
	}

	// No response - record failure
	agentState.RecordFailure()
	if err := deacon.SaveHealthCheckState(townRoot, state); err != nil {
		style.PrintWarning("failed to save health check state: %v", err)
	}
	fmt.Printf("%s Agent %s did not respond (consecutive failures: %d/%d)\n",
		style.Dim.Render("⚠"), agent, agentState.ConsecutiveFailures, healthCheckFailures)

	// Check if force-kill threshold reached
	if agentState.ShouldForceKill(healthCheckFailures) {
		fmt.Printf("%s Agent %s should be force-killed\n", style.Bold.Render("✗"), agent)
		os.Exit(2) // Exit code 2 = should force-kill
	}
	return nil
}

// waitForAgentBeadUpdate polls the bead's update time until it is later than
// baseline (returns true) or timeout has elapsed (returns false).
//
// It polls every two seconds, but never sleeps past the deadline: the last
// sleep is shortened to the time remaining, so the total wait does not
// overshoot timeout and timeouts shorter than the poll interval are honoured.
// Reads that fail are skipped and retried on the next poll. A non-positive
// timeout returns false without polling.
func waitForAgentBeadUpdate(townRoot, beadID string, baseline time.Time, timeout time.Duration) bool {
	const pollInterval = 2 * time.Second
	deadline := time.Now().Add(timeout)
	for {
		remaining := time.Until(deadline)
		if remaining <= 0 {
			return false
		}
		wait := pollInterval
		if remaining < wait {
			wait = remaining
		}
		time.Sleep(wait)

		updated, err := getAgentBeadUpdateTime(townRoot, beadID)
		if err != nil {
			continue
		}
		if updated.After(baseline) {
			return true
		}
	}
}
// runDeaconForceKill implements the force-kill command for the agent named by
// args[0].
//
// It refuses (with an error) while the agent is in its force-kill cooldown,
// and does nothing when the agent's session is not running. Otherwise it
// mails the agent, kills its tmux session, marks the agent bead "killed",
// mails the mayor unless --skip-notify is set, and records the kill in the
// persisted health check state. The mails and the bead update are best
// effort; only a failed session kill aborts the command.
func runDeaconForceKill(cmd *cobra.Command, args []string) error {
	target := args[0]

	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}

	healthState, err := deacon.LoadHealthCheckState(townRoot)
	if err != nil {
		return fmt.Errorf("loading health check state: %w", err)
	}
	targetState := healthState.GetAgentState(target)

	// Cooldown guard: a recently killed agent may not be killed again yet.
	if targetState.IsInCooldown(healthCheckCooldown) {
		left := targetState.CooldownRemaining(healthCheckCooldown).Round(time.Second)
		return fmt.Errorf("agent %s is in cooldown (remaining: %s) - cannot force-kill yet",
			target, left)
	}

	// Only the session name is needed here; the bead ID is resolved again by
	// updateAgentBeadState.
	_, sessionName, err := agentAddressToIDs(target)
	if err != nil {
		return fmt.Errorf("invalid agent address: %w", err)
	}

	tm := tmux.NewTmux()
	alive, err := tm.HasSession(sessionName)
	if err != nil {
		return fmt.Errorf("checking session: %w", err)
	}
	if !alive {
		fmt.Printf("%s Agent %s session not running\n", style.Dim.Render("○"), target)
		return nil
	}

	// Fall back to a generated reason when --reason was not supplied.
	why := forceKillReason
	if why == "" {
		why = fmt.Sprintf("unresponsive after %d consecutive health check failures",
			targetState.ConsecutiveFailures)
	}

	// Step 1: tell the agent what is about to happen.
	fmt.Printf("%s Sending force-kill notification to %s...\n", style.Dim.Render("1."), target)
	sendMail(townRoot, target, "FORCE_KILL: unresponsive",
		fmt.Sprintf("Deacon detected %s as unresponsive.\nReason: %s\nAction: force-killing session", target, why))

	// Step 2: kill the tmux session; this is the only step that can fail the command.
	fmt.Printf("%s Killing tmux session %s...\n", style.Dim.Render("2."), sessionName)
	if killErr := tm.KillSession(sessionName); killErr != nil {
		return fmt.Errorf("killing session: %w", killErr)
	}

	// Step 3: best-effort bead state update.
	fmt.Printf("%s Updating agent bead state to 'killed'...\n", style.Dim.Render("3."))
	updateAgentBeadState(townRoot, target, "killed", why)

	// Step 4: optional notification to the mayor.
	if !forceKillSkipNotify {
		fmt.Printf("%s Notifying mayor...\n", style.Dim.Render("4."))
		sendMail(townRoot, "mayor/", "Agent killed: "+target,
			fmt.Sprintf("Agent %s was force-killed by Deacon.\nReason: %s", target, why))
	}

	// Persist the kill so the cooldown applies to later invocations.
	targetState.RecordForceKill()
	if saveErr := deacon.SaveHealthCheckState(townRoot, healthState); saveErr != nil {
		style.PrintWarning("failed to save health check state: %v", saveErr)
	}

	fmt.Printf("%s Force-killed agent %s (total kills: %d)\n",
		style.Bold.Render("✓"), target, targetState.ForceKillCount)
	fmt.Printf(" %s\n", style.Dim.Render("Agent is now 'asleep'. Use 'gt rig boot' to restart."))
	return nil
}
// runDeaconHealthState shows the current health check state.
//
// For every agent recorded in the persisted state it prints the time since
// the last ping, response and force-kill (each only when set), the
// consecutive failure count, the total force-kill count, and the remaining
// cooldown when one is active. Agents are listed in sorted order of their ID
// so the output is stable between runs (Go map iteration order is random).
//
// Errors are returned for a missing workspace or unreadable state.
func runDeaconHealthState(cmd *cobra.Command, args []string) error {
	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}
	state, err := deacon.LoadHealthCheckState(townRoot)
	if err != nil {
		return fmt.Errorf("loading health check state: %w", err)
	}
	if len(state.Agents) == 0 {
		fmt.Printf("%s No health check state recorded yet\n", style.Dim.Render("○"))
		return nil
	}

	fmt.Printf("%s Health Check State (updated %s)\n\n",
		style.Bold.Render("●"),
		state.LastUpdated.Format(time.RFC3339))

	// Collect and sort the agent IDs for deterministic output.
	agentIDs := make([]string, 0, len(state.Agents))
	for agentID := range state.Agents {
		agentIDs = append(agentIDs, agentID)
	}
	sort.Strings(agentIDs)

	for _, agentID := range agentIDs {
		agentState := state.Agents[agentID]
		fmt.Printf("Agent: %s\n", style.Bold.Render(agentID))
		if !agentState.LastPingTime.IsZero() {
			fmt.Printf(" Last ping: %s ago\n", time.Since(agentState.LastPingTime).Round(time.Second))
		}
		if !agentState.LastResponseTime.IsZero() {
			fmt.Printf(" Last response: %s ago\n", time.Since(agentState.LastResponseTime).Round(time.Second))
		}
		fmt.Printf(" Consecutive failures: %d\n", agentState.ConsecutiveFailures)
		fmt.Printf(" Total force-kills: %d\n", agentState.ForceKillCount)
		if !agentState.LastForceKillTime.IsZero() {
			fmt.Printf(" Last force-kill: %s ago\n", time.Since(agentState.LastForceKillTime).Round(time.Second))
			if agentState.IsInCooldown(healthCheckCooldown) {
				remaining := agentState.CooldownRemaining(healthCheckCooldown)
				fmt.Printf(" Cooldown: %s remaining\n", remaining.Round(time.Second))
			}
		}
		fmt.Println()
	}
	return nil
}
// agentAddressToIDs converts an agent address to bead ID and session name.
//
// Supported address formats:
//   - "deacon", "mayor"            town-level agents; IDs come from the beads
//     and session packages
//   - "<rig>/witness", "<rig>/refinery"
//   - "<rig>/polecats/<name>", "<rig>/crew/<name>"
//
// Note: Town-level agents (Mayor, Deacon) use hq- prefix bead IDs stored in town beads.
//
// An error is returned for an unknown role or agent type, for an address
// with the wrong number of components, and for an address containing an
// empty component (for example "/witness" or "gastown/polecats/"), which
// would otherwise produce a malformed ID such as "gt--witness".
func agentAddressToIDs(address string) (beadID, sessionName string, err error) {
	switch address {
	case "deacon":
		return beads.DeaconBeadIDTown(), session.DeaconSessionName(), nil
	case "mayor":
		return beads.MayorBeadIDTown(), session.MayorSessionName(), nil
	}

	parts := strings.Split(address, "/")
	for _, part := range parts {
		if part == "" {
			return "", "", fmt.Errorf("invalid agent address format: %s (empty path component)", address)
		}
	}

	switch len(parts) {
	case 2:
		// rig/role: "gastown/witness", "gastown/refinery"
		rig, role := parts[0], parts[1]
		switch role {
		case "witness", "refinery":
			// Bead ID and session name are the same string for these roles.
			id := fmt.Sprintf("gt-%s-%s", rig, role)
			return id, id, nil
		default:
			return "", "", fmt.Errorf("unknown role: %s", role)
		}
	case 3:
		// rig/type/name: "gastown/polecats/max", "gastown/crew/alpha"
		rig, agentType, name := parts[0], parts[1], parts[2]
		switch agentType {
		case "polecats":
			// The polecat session name omits the "polecat" segment that the bead ID carries.
			return fmt.Sprintf("gt-%s-polecat-%s", rig, name), fmt.Sprintf("gt-%s-%s", rig, name), nil
		case "crew":
			id := fmt.Sprintf("gt-%s-crew-%s", rig, name)
			return id, id, nil
		default:
			return "", "", fmt.Errorf("unknown agent type: %s", agentType)
		}
	default:
		return "", "", fmt.Errorf("invalid agent address format: %s (expected rig/type/name or rig/role)", address)
	}
}
// getAgentBeadUpdateTime reports when the given bead was last updated.
//
// It runs `bd show <beadID> --json` with townRoot as the working directory,
// decodes the output as a JSON array, and parses the "updated_at" field of
// the first element as an RFC 3339 timestamp. The zero time and an error are
// returned when the command fails, the output is not valid JSON, or the
// array is empty; a timestamp parse failure is returned as-is from
// time.Parse.
func getAgentBeadUpdateTime(townRoot, beadID string) (time.Time, error) {
	show := exec.Command("bd", "show", beadID, "--json")
	show.Dir = townRoot

	raw, runErr := show.Output()
	if runErr != nil {
		return time.Time{}, runErr
	}

	type beadRecord struct {
		UpdatedAt string `json:"updated_at"`
	}
	var records []beadRecord
	if decodeErr := json.Unmarshal(raw, &records); decodeErr != nil {
		return time.Time{}, decodeErr
	}
	if len(records) < 1 {
		return time.Time{}, fmt.Errorf("bead not found: %s", beadID)
	}

	first := records[0]
	return time.Parse(time.RFC3339, first.UpdatedAt)
}
// sendMail delivers a message by running
// `gt mail send <to> -s <subject> -m <body>` with townRoot as the working
// directory. Delivery is best effort: the command's result is discarded and
// nothing is reported to the caller.
func sendMail(townRoot, to, subject, body string) {
	argv := []string{"mail", "send", to, "-s", subject, "-m", body}
	mailCmd := exec.Command("gt", argv...)
	mailCmd.Dir = townRoot
	_ = mailCmd.Run() // Best effort
}
// updateAgentBeadState sets the state of the agent's bead by running
// `bd agent state <beadID> <state>` with townRoot as the working directory.
// The update is best effort: an address that cannot be resolved to a bead ID
// and a failing command are both ignored silently. The fourth parameter
// (the reason) is unused but kept for API consistency.
func updateAgentBeadState(townRoot, agent, state, _ string) {
	beadID, _, resolveErr := agentAddressToIDs(agent)
	if resolveErr == nil {
		stateCmd := exec.Command("bd", "agent", "state", beadID, state)
		stateCmd.Dir = townRoot
		_ = stateCmd.Run() // Best effort
	}
}
// runDeaconStaleHooks handles "gt deacon stale-hooks". It asks
// deacon.ScanStaleHooks to scan with the --max-age and --dry-run settings and
// prints the outcome: the overall counts, one line per entry in the scan
// results, and a closing summary.
func runDeaconStaleHooks(cmd *cobra.Command, args []string) error {
	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}

	scan, err := deacon.ScanStaleHooks(townRoot, &deacon.StaleHookConfig{
		MaxAge: staleHooksMaxAge,
		DryRun: staleHooksDryRun,
	})
	if err != nil {
		return fmt.Errorf("scanning stale hooks: %w", err)
	}

	// Nothing hooked at all.
	if scan.TotalHooked == 0 {
		fmt.Printf("%s No hooked beads found\n", style.Dim.Render("○"))
		return nil
	}
	fmt.Printf("%s Found %d hooked bead(s), %d stale (older than %s)\n",
		style.Bold.Render("●"), scan.TotalHooked, scan.StaleCount, staleHooksMaxAge)

	// Hooked beads exist, but none is past the threshold.
	if scan.StaleCount == 0 {
		fmt.Printf("%s No stale hooked beads\n", style.Dim.Render("○"))
		return nil
	}

	// One line per result. The "skipped" label is the default and also
	// remains in place when none of the cases below applies.
	for _, item := range scan.Results {
		icon, verdict := style.Dim.Render("○"), "skipped (agent alive)"
		switch {
		case item.AgentAlive:
			// keep the default label
		case staleHooksDryRun:
			icon, verdict = style.Bold.Render("?"), "would unhook (agent dead)"
		case item.Unhooked:
			icon, verdict = style.Bold.Render("✓"), "unhooked (agent dead)"
		case item.Error != "":
			icon, verdict = style.Dim.Render("✗"), fmt.Sprintf("error: %s", item.Error)
		}
		fmt.Printf(" %s %s: %s (age: %s, assignee: %s)\n",
			icon, item.BeadID, verdict, item.Age, item.Assignee)
	}

	// Closing summary.
	switch {
	case staleHooksDryRun:
		fmt.Printf("\n%s Dry run - no changes made. Run without --dry-run to unhook.\n",
			style.Dim.Render(""))
	case scan.Unhooked > 0:
		fmt.Printf("\n%s Unhooked %d stale bead(s)\n",
			style.Bold.Render("✓"), scan.Unhooked)
	}
	return nil
}
// runDeaconPause pauses the Deacon to prevent patrol actions.
//
// When the Deacon is already paused it only prints the existing pause
// details and returns nil. Otherwise it records the pause with the --reason
// text and "human" as the pausing party, then prints the pause file location
// and how to resume. The "Reason:" line is printed only when a reason was
// recorded, matching how the status and heartbeat commands report it.
//
// Errors are returned for a missing workspace, an unreadable pause state, or
// a failure to write the pause.
func runDeaconPause(cmd *cobra.Command, args []string) error {
	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}

	// Check if already paused
	paused, state, err := deacon.IsPaused(townRoot)
	if err != nil {
		return fmt.Errorf("checking pause state: %w", err)
	}
	if paused {
		fmt.Printf("%s Deacon is already paused\n", style.Dim.Render("○"))
		if state.Reason != "" {
			fmt.Printf(" Reason: %s\n", state.Reason)
		}
		fmt.Printf(" Paused at: %s\n", state.PausedAt.Format(time.RFC3339))
		fmt.Printf(" Paused by: %s\n", state.PausedBy)
		return nil
	}

	// Pause the Deacon
	if err := deacon.Pause(townRoot, pauseReason, "human"); err != nil {
		return fmt.Errorf("pausing Deacon: %w", err)
	}

	fmt.Printf("%s Deacon paused\n", style.Bold.Render("⏸️"))
	if pauseReason != "" {
		fmt.Printf(" Reason: %s\n", pauseReason)
	}
	fmt.Printf(" Pause file: %s\n", deacon.GetPauseFile(townRoot))
	fmt.Println()
	fmt.Printf("The Deacon will not perform any patrol actions until resumed.\n")
	fmt.Printf("Resume with: %s\n", style.Dim.Render("gt deacon resume"))
	return nil
}
// runDeaconResume handles "gt deacon resume". When the Deacon is not paused
// it only says so; otherwise it lifts the pause via deacon.Resume and
// confirms that patrol actions are allowed again.
func runDeaconResume(cmd *cobra.Command, args []string) error {
	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}

	// Only the paused flag matters here; the pause details are not needed.
	isPaused, _, err := deacon.IsPaused(townRoot)
	switch {
	case err != nil:
		return fmt.Errorf("checking pause state: %w", err)
	case !isPaused:
		fmt.Printf("%s Deacon is not paused\n", style.Dim.Render("○"))
		return nil
	}

	if resumeErr := deacon.Resume(townRoot); resumeErr != nil {
		return fmt.Errorf("resuming Deacon: %w", resumeErr)
	}

	fmt.Printf("%s Deacon resumed\n", style.Bold.Render("▶️"))
	fmt.Println("The Deacon can now perform patrol actions.")
	return nil
}