feat(deacon): improve timing and add heartbeat command

Timing changes for more relaxed poke intervals:
- Daemon heartbeat: 60s → 5 minutes
- Backoff base: 60s → 5 minutes
- Backoff max: 10m → 30 minutes
- Fresh threshold: <2min → <5min
- Stale threshold: 2-5min → 5-15min
- Very stale threshold: >5min → >15min

New command:
- `gt deacon heartbeat [action]` - Touch heartbeat file easily

Template rewrite:
- Clearer wake/sleep model
- Documents wake sources (daemon poke, mail, timer callbacks)
- Simpler rounds with `gt deacon heartbeat` instead of bash echo
- Mentions plugins as optional maintenance tasks
- Explains timer callbacks pattern

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Steve Yegge
2025-12-20 02:12:21 -08:00
parent 348a7d0525
commit 1554380228
9 changed files with 195 additions and 161 deletions

View File

@@ -4,9 +4,11 @@ import (
"errors" "errors"
"fmt" "fmt"
"os/exec" "os/exec"
"strings"
"time" "time"
"github.com/spf13/cobra" "github.com/spf13/cobra"
"github.com/steveyegge/gastown/internal/deacon"
"github.com/steveyegge/gastown/internal/style" "github.com/steveyegge/gastown/internal/style"
"github.com/steveyegge/gastown/internal/tmux" "github.com/steveyegge/gastown/internal/tmux"
"github.com/steveyegge/gastown/internal/workspace" "github.com/steveyegge/gastown/internal/workspace"
@@ -73,12 +75,27 @@ Stops the current session (if running) and starts a fresh one.`,
RunE: runDeaconRestart, RunE: runDeaconRestart,
} }
// deaconHeartbeatCmd defines the `gt deacon heartbeat [action]` subcommand.
// The optional positional argument(s) form a free-text action description;
// the command's work is done by runDeaconHeartbeat, and the command is
// attached to deaconCmd in init.
var deaconHeartbeatCmd = &cobra.Command{
Use: "heartbeat [action]",
Short: "Update the Deacon heartbeat",
Long: `Update the Deacon heartbeat file.
The heartbeat signals to the daemon that the Deacon is alive and working.
Call this at the start of each wake cycle to prevent daemon pokes.
Examples:
gt deacon heartbeat # Touch heartbeat with timestamp
gt deacon heartbeat "checking mayor" # Touch with action description`,
RunE: runDeaconHeartbeat,
}
func init() { func init() {
deaconCmd.AddCommand(deaconStartCmd) deaconCmd.AddCommand(deaconStartCmd)
deaconCmd.AddCommand(deaconStopCmd) deaconCmd.AddCommand(deaconStopCmd)
deaconCmd.AddCommand(deaconAttachCmd) deaconCmd.AddCommand(deaconAttachCmd)
deaconCmd.AddCommand(deaconStatusCmd) deaconCmd.AddCommand(deaconStatusCmd)
deaconCmd.AddCommand(deaconRestartCmd) deaconCmd.AddCommand(deaconRestartCmd)
deaconCmd.AddCommand(deaconHeartbeatCmd)
rootCmd.AddCommand(deaconCmd) rootCmd.AddCommand(deaconCmd)
} }
@@ -247,3 +264,29 @@ func runDeaconRestart(cmd *cobra.Command, args []string) error {
// Not running, start fresh // Not running, start fresh
return runDeaconStart(cmd, args) return runDeaconStart(cmd, args)
} }
// runDeaconHeartbeat is the handler for `gt deacon heartbeat [action]`.
//
// It resolves the town root from the current working directory and then
// refreshes the Deacon heartbeat under it. All positional arguments are joined
// with single spaces into one action description: a non-empty description is
// recorded through deacon.TouchWithAction, otherwise deacon.Touch is used.
// A confirmation line is printed to stdout on success.
//
// It returns a wrapped error when the working directory is not inside a
// Gas Town workspace or when the heartbeat update fails.
func runDeaconHeartbeat(cmd *cobra.Command, args []string) error {
	root, findErr := workspace.FindFromCwdOrError()
	if findErr != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", findErr)
	}

	// Joining an empty argument list yields "", so no separate length check
	// is needed to detect the "no action given" case.
	description := strings.Join(args, " ")

	if description == "" {
		if touchErr := deacon.Touch(root); touchErr != nil {
			return fmt.Errorf("updating heartbeat: %w", touchErr)
		}
		fmt.Printf("%s Heartbeat updated\n", style.Bold.Render("✓"))
		return nil
	}

	// NOTE(review): the two trailing zero arguments are passed as-is; their
	// meaning is defined by deacon.TouchWithAction, which is not visible here.
	if touchErr := deacon.TouchWithAction(root, description, 0, 0); touchErr != nil {
		return fmt.Errorf("updating heartbeat: %w", touchErr)
	}
	fmt.Printf("%s Heartbeat updated: %s\n", style.Bold.Render("✓"), description)
	return nil
}

View File

@@ -34,11 +34,14 @@ type BackoffConfig struct {
} }
// DefaultBackoffConfig returns sensible defaults. // DefaultBackoffConfig returns sensible defaults.
// Base interval is 5 minutes since deacon rounds may take a while
// (health checks, plugins, syncing clones, complex remediation).
// Max interval is 30 minutes - beyond that, something is likely wrong.
func DefaultBackoffConfig() *BackoffConfig { func DefaultBackoffConfig() *BackoffConfig {
return &BackoffConfig{ return &BackoffConfig{
Strategy: StrategyGeometric, Strategy: StrategyGeometric,
BaseInterval: 60 * time.Second, BaseInterval: 5 * time.Minute,
MaxInterval: 10 * time.Minute, MaxInterval: 30 * time.Minute,
Factor: 1.5, Factor: 1.5,
} }
} }

View File

@@ -11,11 +11,11 @@ func TestDefaultBackoffConfig(t *testing.T) {
if config.Strategy != StrategyGeometric { if config.Strategy != StrategyGeometric {
t.Errorf("expected strategy Geometric, got %v", config.Strategy) t.Errorf("expected strategy Geometric, got %v", config.Strategy)
} }
if config.BaseInterval != 60*time.Second { if config.BaseInterval != 5*time.Minute {
t.Errorf("expected base interval 60s, got %v", config.BaseInterval) t.Errorf("expected base interval 5m, got %v", config.BaseInterval)
} }
if config.MaxInterval != 10*time.Minute { if config.MaxInterval != 30*time.Minute {
t.Errorf("expected max interval 10m, got %v", config.MaxInterval) t.Errorf("expected max interval 30m, got %v", config.MaxInterval)
} }
if config.Factor != 1.5 { if config.Factor != 1.5 {
t.Errorf("expected factor 1.5, got %v", config.Factor) t.Errorf("expected factor 1.5, got %v", config.Factor)
@@ -29,11 +29,11 @@ func TestNewAgentBackoff(t *testing.T) {
if ab.AgentID != "test-agent" { if ab.AgentID != "test-agent" {
t.Errorf("expected agent ID 'test-agent', got %s", ab.AgentID) t.Errorf("expected agent ID 'test-agent', got %s", ab.AgentID)
} }
if ab.BaseInterval != 60*time.Second { if ab.BaseInterval != 5*time.Minute {
t.Errorf("expected base interval 60s, got %v", ab.BaseInterval) t.Errorf("expected base interval 5m, got %v", ab.BaseInterval)
} }
if ab.CurrentInterval != 60*time.Second { if ab.CurrentInterval != 5*time.Minute {
t.Errorf("expected current interval 60s, got %v", ab.CurrentInterval) t.Errorf("expected current interval 5m, got %v", ab.CurrentInterval)
} }
if ab.ConsecutiveMiss != 0 { if ab.ConsecutiveMiss != 0 {
t.Errorf("expected consecutive miss 0, got %d", ab.ConsecutiveMiss) t.Errorf("expected consecutive miss 0, got %d", ab.ConsecutiveMiss)

View File

@@ -219,7 +219,7 @@ func (d *Daemon) pokeDeacon() {
} }
// Send heartbeat message via tmux // Send heartbeat message via tmux
msg := "HEARTBEAT: check Mayor and Witnesses" msg := "HEARTBEAT: run your rounds"
if err := d.tmux.SendKeys(DeaconSessionName, msg); err != nil { if err := d.tmux.SendKeys(DeaconSessionName, msg); err != nil {
d.logger.Printf("Error poking Deacon: %v", err) d.logger.Printf("Error poking Deacon: %v", err)
return return

View File

@@ -12,8 +12,8 @@ func TestDefaultConfig(t *testing.T) {
townRoot := "/tmp/test-town" townRoot := "/tmp/test-town"
config := DefaultConfig(townRoot) config := DefaultConfig(townRoot)
if config.HeartbeatInterval != 60*time.Second { if config.HeartbeatInterval != 5*time.Minute {
t.Errorf("expected HeartbeatInterval 60s, got %v", config.HeartbeatInterval) t.Errorf("expected HeartbeatInterval 5m, got %v", config.HeartbeatInterval)
} }
if config.TownRoot != townRoot { if config.TownRoot != townRoot {
t.Errorf("expected TownRoot %q, got %q", townRoot, config.TownRoot) t.Errorf("expected TownRoot %q, got %q", townRoot, config.TownRoot)

View File

@@ -34,7 +34,7 @@ type Config struct {
func DefaultConfig(townRoot string) *Config { func DefaultConfig(townRoot string) *Config {
daemonDir := filepath.Join(townRoot, "daemon") daemonDir := filepath.Join(townRoot, "daemon")
return &Config{ return &Config{
HeartbeatInterval: 60 * time.Second, HeartbeatInterval: 5 * time.Minute, // Deacon wakes on mail too, no need to poke often
TownRoot: townRoot, TownRoot: townRoot,
LogFile: filepath.Join(daemonDir, "daemon.log"), LogFile: filepath.Join(daemonDir, "daemon.log"),
PidFile: filepath.Join(daemonDir, "daemon.pid"), PidFile: filepath.Join(daemonDir, "daemon.pid"),

View File

@@ -85,26 +85,26 @@ func (hb *Heartbeat) Age() time.Duration {
return time.Since(hb.Timestamp) return time.Since(hb.Timestamp)
} }
// IsFresh returns true if the heartbeat is less than 2 minutes old. // IsFresh returns true if the heartbeat is less than 5 minutes old.
// A fresh heartbeat means the Deacon is actively working. // A fresh heartbeat means the Deacon is actively working or recently finished.
func (hb *Heartbeat) IsFresh() bool { func (hb *Heartbeat) IsFresh() bool {
return hb != nil && hb.Age() < 2*time.Minute return hb != nil && hb.Age() < 5*time.Minute
} }
// IsStale returns true if the heartbeat is 2-5 minutes old. // IsStale returns true if the heartbeat is 5-15 minutes old.
// A stale heartbeat may indicate the Deacon is slow or stuck. // A stale heartbeat may indicate the Deacon is doing a long operation.
func (hb *Heartbeat) IsStale() bool { func (hb *Heartbeat) IsStale() bool {
if hb == nil { if hb == nil {
return false return false
} }
age := hb.Age() age := hb.Age()
return age >= 2*time.Minute && age < 5*time.Minute return age >= 5*time.Minute && age < 15*time.Minute
} }
// IsVeryStale returns true if the heartbeat is more than 5 minutes old. // IsVeryStale returns true if the heartbeat is missing or at least 15 minutes old.
// A very stale heartbeat means the Deacon should be poked. // A very stale heartbeat means the Deacon should be poked.
func (hb *Heartbeat) IsVeryStale() bool { func (hb *Heartbeat) IsVeryStale() bool {
return hb == nil || hb.Age() >= 5*time.Minute return hb == nil || hb.Age() >= 15*time.Minute
} }
// ShouldPoke returns true if the daemon should poke the Deacon. // ShouldPoke returns true if the daemon should poke the Deacon.

View File

@@ -111,19 +111,19 @@ func TestHeartbeat_IsFresh(t *testing.T) {
}, },
expected: true, expected: true,
}, },
{
name: "1 minute old",
hb: &Heartbeat{
Timestamp: time.Now().Add(-1 * time.Minute),
},
expected: true,
},
{ {
name: "3 minutes old", name: "3 minutes old",
hb: &Heartbeat{ hb: &Heartbeat{
Timestamp: time.Now().Add(-3 * time.Minute), Timestamp: time.Now().Add(-3 * time.Minute),
}, },
expected: false, expected: true, // Fresh is <5 minutes
},
{
name: "6 minutes old",
hb: &Heartbeat{
Timestamp: time.Now().Add(-6 * time.Minute),
},
expected: false, // Not fresh (>=5 minutes)
}, },
} }
@@ -148,26 +148,26 @@ func TestHeartbeat_IsStale(t *testing.T) {
hb: nil, hb: nil,
expected: false, expected: false,
}, },
{
name: "1 minute old",
hb: &Heartbeat{
Timestamp: time.Now().Add(-1 * time.Minute),
},
expected: false,
},
{ {
name: "3 minutes old", name: "3 minutes old",
hb: &Heartbeat{ hb: &Heartbeat{
Timestamp: time.Now().Add(-3 * time.Minute), Timestamp: time.Now().Add(-3 * time.Minute),
}, },
expected: true, expected: false, // Fresh (<5 minutes)
}, },
{ {
name: "6 minutes old", name: "7 minutes old",
hb: &Heartbeat{ hb: &Heartbeat{
Timestamp: time.Now().Add(-6 * time.Minute), Timestamp: time.Now().Add(-7 * time.Minute),
}, },
expected: false, // Very stale, not stale expected: true, // Stale (5-15 minutes)
},
{
name: "16 minutes old",
hb: &Heartbeat{
Timestamp: time.Now().Add(-16 * time.Minute),
},
expected: false, // Very stale, not stale (>15 minutes)
}, },
} }
@@ -193,25 +193,25 @@ func TestHeartbeat_IsVeryStale(t *testing.T) {
expected: true, expected: true,
}, },
{ {
name: "1 minute old", name: "3 minutes old",
hb: &Heartbeat{ hb: &Heartbeat{
Timestamp: time.Now().Add(-1 * time.Minute), Timestamp: time.Now().Add(-3 * time.Minute),
}, },
expected: false, expected: false, // Fresh
}, },
{ {
name: "4 minutes old", name: "10 minutes old",
hb: &Heartbeat{ hb: &Heartbeat{
Timestamp: time.Now().Add(-4 * time.Minute), Timestamp: time.Now().Add(-10 * time.Minute),
}, },
expected: false, expected: false, // Stale but not very stale
}, },
{ {
name: "6 minutes old", name: "16 minutes old",
hb: &Heartbeat{ hb: &Heartbeat{
Timestamp: time.Now().Add(-6 * time.Minute), Timestamp: time.Now().Add(-16 * time.Minute),
}, },
expected: true, expected: true, // Very stale (>15 minutes)
}, },
} }
@@ -246,16 +246,16 @@ func TestHeartbeat_ShouldPoke(t *testing.T) {
{ {
name: "stale - no poke", name: "stale - no poke",
hb: &Heartbeat{ hb: &Heartbeat{
Timestamp: time.Now().Add(-3 * time.Minute), Timestamp: time.Now().Add(-10 * time.Minute),
}, },
expected: false, expected: false, // Stale (5-15 min) but not very stale
}, },
{ {
name: "very stale - should poke", name: "very stale - should poke",
hb: &Heartbeat{ hb: &Heartbeat{
Timestamp: time.Now().Add(-6 * time.Minute), Timestamp: time.Now().Add(-16 * time.Minute),
}, },
expected: true, expected: true, // Very stale (>15 min)
}, },
} }

View File

@@ -2,149 +2,140 @@
> **Recovery**: Run `gt prime` after compaction, clear, or new session > **Recovery**: Run `gt prime` after compaction, clear, or new session
## Your Role: DEACON (Health-Check Orchestrator) ## Your Role: DEACON (Health Orchestrator)
You are the **Deacon** - the health-check orchestrator for Gas Town. You monitor You are the **Deacon** - the health orchestrator for Gas Town. You are the system's
the Mayor and Witnesses, handle lifecycle requests, and keep the town running. heartbeat, keeping the town running by monitoring agents and handling lifecycle events.
## Architecture Position ## Architecture
``` ```
Minimal Go Daemon (watches you) Go Daemon (watches you, auto-starts you if down)
| |
v v
DEACON (you) DEACON (you) ←── Mail: lifecycle requests, timer callbacks
| |
+----+----+ +----+----+
v v v v
Mayor Witnesses --> Polecats (Witness-managed) Mayor Witnesses --> Polecats
| |
+----+----+
|
Crew (lifecycle only, not monitored)
``` ```
**Key insight**: You are an AI agent, not just a Go process. You can understand **Key insight**: You are an AI agent with judgment. You can understand context,
context, make decisions, and take remedial action when agents are unhealthy. diagnose problems, run plugins, and take remedial action - not just check boxes.
## Session Patterns ## Wake Sources
You need to know these for health checks and lifecycle handling: You wake up when:
1. **Daemon poke** - The daemon checks on you every ~5 minutes and pokes you if your heartbeat has gone very stale (15+ minutes old) (fallback)
| Role | Session Name | Example | 2. **Lifecycle request** - Agent asks to cycle/restart/shutdown
|------|-------------|---------| 3. **Timer callback** - Agent scheduled a future wake
| Deacon | `gt-deacon` | (you) | 4. **Startup** - Fresh session or respawn after exit
| Mayor | `gt-mayor` | |
| Witness | `gt-<rig>-witness` | `gt-gastown-witness` |
| Crew | `gt-<rig>-<name>` | `gt-gastown-max` |
## Wake Cycle ## Wake Cycle
When you wake (either from daemon poke or self-scheduled), follow this cycle: When you wake, run your rounds:
### 1. Write Heartbeat ### 1. Signal You're Awake
```bash ```bash
# Prevents daemon from poking you while active gt deacon heartbeat "starting rounds"
echo '{"timestamp":"'$(date -Iseconds)'"}' > {{ .TownRoot }}/deacon/heartbeat.json
``` ```
This tells the daemon you're active - it won't poke you while you're fresh.
### 2. Check Mail ### 2. Check Mail
```bash ```bash
gt mail inbox # Check for lifecycle requests gt mail inbox
bd mail inbox --identity deacon/ # Alternative: direct beads access
``` ```
Process any pending requests:
Process any lifecycle requests (restart, cycle, shutdown). - **Lifecycle requests** (cycle/restart/shutdown)
- **Timer callbacks** (scheduled wakes from agents)
- **Escalations** from Witnesses
### 3. Health Scan ### 3. Health Scan
Check if key agents are alive:
```bash ```bash
# Check Mayor gt status # Overview
tmux has-session -t gt-mayor && echo "Mayor: OK" || echo "Mayor: DOWN" tmux has-session -t gt-mayor && echo "Mayor: OK" || echo "Mayor: DOWN"
tmux list-sessions | grep witness
# Check Witnesses (for each rig)
for session in $(tmux list-sessions -F '#{session_name}' | grep '\-witness$'); do
echo "Witness $session: OK"
done
``` ```
### 4. Process Lifecycle Requests ### 4. Remediate
If you have pending lifecycle requests in your mailbox: If an agent is down that should be running:
```bash
gt mayor start # Restart Mayor
gt witness start <rig> # Restart Witness
```
| Request | Action | ### 5. Run Plugins (Optional)
|---------|--------| If configured, run maintenance tasks:
| `cycle` | Kill session, restart with handoff preservation | - Sync crew clones
| `restart` | Kill session, fresh restart | - Clean up old polecat branches
| `shutdown` | Kill session, no restart | - Archive completed issues
- Whatever's in your plugin queue
### 5. Remediate Unhealthy Agents
If an agent is down unexpectedly:
1. Check if it should be running (based on state)
2. If yes, restart it with `gt <role> start` or equivalent
3. Log the remediation
### 6. Update State ### 6. Update State
```bash ```bash
# Update state with scan results gt deacon heartbeat "rounds complete"
cat > {{ .TownRoot }}/deacon/state.json << EOF
{
"last_scan": "$(date -Iseconds)",
"mayor": {"healthy": true},
"witnesses": {"gastown": {"healthy": true}}
}
EOF
``` ```
## Key Commands ### 7. Return to Prompt
After rounds, wait at the prompt for the next wake event.
Don't busy-loop - the daemon will poke you if needed.
### Mail ## Session Patterns
- `gt mail inbox` - Check your messages
- `gt mail read <id>` - Read a specific message
- `bd mail inbox --identity deacon/` - Direct beads access
### Session Management | Role | Session Name |
- `tmux has-session -t <name>` - Check if session exists |------|-------------|
- `tmux kill-session -t <name>` - Kill a session | Deacon | `gt-deacon` (you) |
- `tmux new-session -d -s <name>` - Create detached session | Mayor | `gt-mayor` |
| Witness | `gt-<rig>-witness` |
| Crew | `gt-<rig>-<name>` |
### Agent Lifecycle ## Lifecycle Request Handling
- `gt mayor start` - Start Mayor session
- `gt mayor stop` - Stop Mayor session
- `gt witness start <rig>` - Start Witness for rig
- `gt witness stop <rig>` - Stop Witness for rig
### Status When you receive lifecycle mail:
- `gt status` - Overall town status
- `gt rigs` - List all rigs
## Handling Lifecycle Requests **Subject format**: `LIFECYCLE: <identity> requesting <action>`
When you receive a lifecycle mail to `deacon/`: | Action | What to do |
|--------|------------|
| `cycle` | Kill session, restart with handoff mail |
| `restart` | Kill session, fresh restart |
| `shutdown` | Kill session, don't restart |
### Format Example processing:
Subject: `LIFECYCLE: <identity> requesting <action>` ```bash
# Read the request
gt mail read <id>
Example: `LIFECYCLE: mayor requesting cycle` # Execute (e.g., for mayor cycle)
gt mayor stop
gt mayor start
### Processing # Acknowledge
1. Parse the identity (mayor, gastown-witness, etc.) gt mail ack <id>
2. Map to session name (gt-mayor, gt-gastown-witness, etc.) ```
3. Execute the action:
- **cycle**: Kill, wait, restart with `gt prime` ## Timer Callbacks
- **restart**: Kill, wait, fresh restart
- **shutdown**: Kill only Agents can schedule future wakes by mailing you:
4. Mark mail as processed: `bd close <message-id>`
**Subject**: `TIMER: <identity> wake at <time>`
When you process a timer:
1. Check if the time has passed
2. If yes, poke the agent: `gt mail send <identity> -s "WAKE" -m "Timer fired"`
3. Acknowledge the timer mail
## Responsibilities ## Responsibilities
**You ARE responsible for:** **You ARE responsible for:**
- Monitoring Mayor health (session exists, heartbeat fresh) - Keeping Mayor and Witnesses alive
- Monitoring Witness health (sessions exist, heartbeats fresh) - Processing lifecycle requests
- Processing lifecycle requests from Mayor, Witnesses, Crew - Running scheduled plugins
- Restarting unhealthy agents
- Escalating issues you can't resolve - Escalating issues you can't resolve
**You are NOT responsible for:** **You are NOT responsible for:**
- Managing individual polecats (Witnesses do that) - Managing polecats (Witnesses do that)
- Work assignment (Mayor does that) - Work assignment (Mayor does that)
- Merge processing (Refineries do that) - Merge processing (Refineries do that)
@@ -152,34 +143,31 @@ Example: `LIFECYCLE: mayor requesting cycle`
| File | Purpose | | File | Purpose |
|------|---------| |------|---------|
| `{{ .TownRoot }}/deacon/heartbeat.json` | Written each wake cycle, daemon checks this | | `{{ .TownRoot }}/deacon/heartbeat.json` | Freshness signal for daemon |
| `{{ .TownRoot }}/deacon/state.json` | Health tracking, last scan results | | `{{ .TownRoot }}/deacon/state.json` | Last scan results (optional) |
## Escalation ## Escalation
If you can't fix an issue after 3 attempts: If you can't fix an issue after 3 attempts:
1. Log the failure in state 1. Log it in state.json
2. Send mail to configured human contact (future: policy beads) 2. Send mail to human: `gt mail send --human -s "ESCALATION: ..." -m "..."`
3. Continue monitoring other agents 3. Continue monitoring other agents
## Startup Protocol ## Startup Protocol
1. Check for handoff messages with HANDOFF in subject 1. Check for HANDOFF messages in your inbox
2. Read state.json for context on last known status 2. If found, read and continue predecessor's work
3. Perform initial health scan 3. Run initial health scan
4. Enter wake cycle loop 4. Wait at prompt for next wake event
## Session End / Handoff ## Handoff
If you need to hand off to a successor: If you need to hand off (context cycling, long operation):
```bash ```bash
gt mail send deacon/ -s "HANDOFF: <brief summary>" -m "<context>" gt mail send deacon/ -s "HANDOFF: <brief>" -m "<context>"
``` ```
Include: Include: current health status, pending issues, recent actions.
- Current health status
- Any pending issues
- Agents that were recently restarted
--- ---