feat(deacon): improve timing and add heartbeat command

Timing changes for more relaxed poke intervals:
- Daemon heartbeat: 60s → 5 minutes
- Backoff base: 60s → 5 minutes
- Backoff max: 10m → 30 minutes
- Fresh threshold: <2min → <5min
- Stale threshold: 2-5min → 5-15min
- Very stale threshold: >5min → >15min

New command:
- `gt deacon heartbeat [action]` - Touch heartbeat file easily

Template rewrite:
- Clearer wake/sleep model
- Documents wake sources (daemon poke, mail, timer callbacks)
- Simpler rounds with `gt deacon heartbeat` instead of bash echo
- Mentions plugins as optional maintenance tasks
- Explains timer callbacks pattern

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Steve Yegge
2025-12-20 02:12:21 -08:00
parent 348a7d0525
commit 1554380228
9 changed files with 195 additions and 161 deletions

View File

@@ -4,9 +4,11 @@ import (
"errors"
"fmt"
"os/exec"
"strings"
"time"
"github.com/spf13/cobra"
"github.com/steveyegge/gastown/internal/deacon"
"github.com/steveyegge/gastown/internal/style"
"github.com/steveyegge/gastown/internal/tmux"
"github.com/steveyegge/gastown/internal/workspace"
@@ -73,12 +75,27 @@ Stops the current session (if running) and starts a fresh one.`,
RunE: runDeaconRestart,
}
// deaconHeartbeatCmd defines the `gt deacon heartbeat [action]` subcommand.
// It is registered on deaconCmd in init and dispatches to runDeaconHeartbeat.
// Any positional arguments are treated as an optional action description.
var deaconHeartbeatCmd = &cobra.Command{
Use: "heartbeat [action]",
Short: "Update the Deacon heartbeat",
Long: `Update the Deacon heartbeat file.
The heartbeat signals to the daemon that the Deacon is alive and working.
Call this at the start of each wake cycle to prevent daemon pokes.
Examples:
gt deacon heartbeat # Touch heartbeat with timestamp
gt deacon heartbeat "checking mayor" # Touch with action description`,
RunE: runDeaconHeartbeat,
}
// init attaches every deacon subcommand to deaconCmd, in declaration order,
// and then hangs deaconCmd off the root command.
func init() {
	deaconCmd.AddCommand(
		deaconStartCmd,
		deaconStopCmd,
		deaconAttachCmd,
		deaconStatusCmd,
		deaconRestartCmd,
		deaconHeartbeatCmd,
	)
	rootCmd.AddCommand(deaconCmd)
}
@@ -247,3 +264,29 @@ func runDeaconRestart(cmd *cobra.Command, args []string) error {
// Not running, start fresh
return runDeaconStart(cmd, args)
}
// runDeaconHeartbeat implements `gt deacon heartbeat [action]`.
//
// It resolves the town root via workspace.FindFromCwdOrError, joins all
// positional arguments with single spaces into one action string, and then
// updates the heartbeat: deacon.TouchWithAction when an action was supplied,
// deacon.Touch otherwise. On success a confirmation line is printed to
// stdout. Any failure is returned wrapped with context.
func runDeaconHeartbeat(cmd *cobra.Command, args []string) error {
	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}

	// strings.Join returns "" for an empty slice, so no length check is needed.
	action := strings.Join(args, " ")

	// No action text: plain touch.
	if action == "" {
		if touchErr := deacon.Touch(townRoot); touchErr != nil {
			return fmt.Errorf("updating heartbeat: %w", touchErr)
		}
		fmt.Printf("%s Heartbeat updated\n", style.Bold.Render("✓"))
		return nil
	}

	// NOTE(review): the two trailing zero arguments are passed through as-is;
	// their meaning is defined by deacon.TouchWithAction — confirm there.
	if touchErr := deacon.TouchWithAction(townRoot, action, 0, 0); touchErr != nil {
		return fmt.Errorf("updating heartbeat: %w", touchErr)
	}
	fmt.Printf("%s Heartbeat updated: %s\n", style.Bold.Render("✓"), action)
	return nil
}

View File

@@ -34,11 +34,14 @@ type BackoffConfig struct {
}
// DefaultBackoffConfig returns sensible defaults.
// Base interval is 5 minutes since deacon rounds may take a while
// (health checks, plugins, syncing clones, complex remediation).
// Max interval is 30 minutes - beyond that, something is likely wrong.
func DefaultBackoffConfig() *BackoffConfig {
return &BackoffConfig{
Strategy: StrategyGeometric,
BaseInterval: 60 * time.Second,
MaxInterval: 10 * time.Minute,
BaseInterval: 5 * time.Minute,
MaxInterval: 30 * time.Minute,
Factor: 1.5,
}
}

View File

@@ -11,11 +11,11 @@ func TestDefaultBackoffConfig(t *testing.T) {
if config.Strategy != StrategyGeometric {
t.Errorf("expected strategy Geometric, got %v", config.Strategy)
}
if config.BaseInterval != 60*time.Second {
t.Errorf("expected base interval 60s, got %v", config.BaseInterval)
if config.BaseInterval != 5*time.Minute {
t.Errorf("expected base interval 5m, got %v", config.BaseInterval)
}
if config.MaxInterval != 10*time.Minute {
t.Errorf("expected max interval 10m, got %v", config.MaxInterval)
if config.MaxInterval != 30*time.Minute {
t.Errorf("expected max interval 30m, got %v", config.MaxInterval)
}
if config.Factor != 1.5 {
t.Errorf("expected factor 1.5, got %v", config.Factor)
@@ -29,11 +29,11 @@ func TestNewAgentBackoff(t *testing.T) {
if ab.AgentID != "test-agent" {
t.Errorf("expected agent ID 'test-agent', got %s", ab.AgentID)
}
if ab.BaseInterval != 60*time.Second {
t.Errorf("expected base interval 60s, got %v", ab.BaseInterval)
if ab.BaseInterval != 5*time.Minute {
t.Errorf("expected base interval 5m, got %v", ab.BaseInterval)
}
if ab.CurrentInterval != 60*time.Second {
t.Errorf("expected current interval 60s, got %v", ab.CurrentInterval)
if ab.CurrentInterval != 5*time.Minute {
t.Errorf("expected current interval 5m, got %v", ab.CurrentInterval)
}
if ab.ConsecutiveMiss != 0 {
t.Errorf("expected consecutive miss 0, got %d", ab.ConsecutiveMiss)

View File

@@ -219,7 +219,7 @@ func (d *Daemon) pokeDeacon() {
}
// Send heartbeat message via tmux
msg := "HEARTBEAT: check Mayor and Witnesses"
msg := "HEARTBEAT: run your rounds"
if err := d.tmux.SendKeys(DeaconSessionName, msg); err != nil {
d.logger.Printf("Error poking Deacon: %v", err)
return

View File

@@ -12,8 +12,8 @@ func TestDefaultConfig(t *testing.T) {
townRoot := "/tmp/test-town"
config := DefaultConfig(townRoot)
if config.HeartbeatInterval != 60*time.Second {
t.Errorf("expected HeartbeatInterval 60s, got %v", config.HeartbeatInterval)
if config.HeartbeatInterval != 5*time.Minute {
t.Errorf("expected HeartbeatInterval 5m, got %v", config.HeartbeatInterval)
}
if config.TownRoot != townRoot {
t.Errorf("expected TownRoot %q, got %q", townRoot, config.TownRoot)

View File

@@ -34,7 +34,7 @@ type Config struct {
func DefaultConfig(townRoot string) *Config {
daemonDir := filepath.Join(townRoot, "daemon")
return &Config{
HeartbeatInterval: 60 * time.Second,
HeartbeatInterval: 5 * time.Minute, // Deacon wakes on mail too, no need to poke often
TownRoot: townRoot,
LogFile: filepath.Join(daemonDir, "daemon.log"),
PidFile: filepath.Join(daemonDir, "daemon.pid"),

View File

@@ -85,26 +85,26 @@ func (hb *Heartbeat) Age() time.Duration {
return time.Since(hb.Timestamp)
}
// IsFresh returns true if the heartbeat is less than 2 minutes old.
// A fresh heartbeat means the Deacon is actively working.
// IsFresh returns true if the heartbeat is less than 5 minutes old.
// A fresh heartbeat means the Deacon is actively working or recently finished.
func (hb *Heartbeat) IsFresh() bool {
return hb != nil && hb.Age() < 2*time.Minute
return hb != nil && hb.Age() < 5*time.Minute
}
// IsStale returns true if the heartbeat is 2-5 minutes old.
// A stale heartbeat may indicate the Deacon is slow or stuck.
// IsStale returns true if the heartbeat is at least 5 but less than 15 minutes old.
// A stale heartbeat may indicate the Deacon is doing a long operation.
func (hb *Heartbeat) IsStale() bool {
if hb == nil {
return false
}
age := hb.Age()
return age >= 2*time.Minute && age < 5*time.Minute
return age >= 5*time.Minute && age < 15*time.Minute
}
// IsVeryStale returns true if the heartbeat is more than 5 minutes old.
// IsVeryStale returns true if there is no heartbeat (nil) or it is at least 15 minutes old.
// A very stale heartbeat means the Deacon should be poked.
func (hb *Heartbeat) IsVeryStale() bool {
return hb == nil || hb.Age() >= 5*time.Minute
return hb == nil || hb.Age() >= 15*time.Minute
}
// ShouldPoke returns true if the daemon should poke the Deacon.

View File

@@ -111,19 +111,19 @@ func TestHeartbeat_IsFresh(t *testing.T) {
},
expected: true,
},
{
name: "1 minute old",
hb: &Heartbeat{
Timestamp: time.Now().Add(-1 * time.Minute),
},
expected: true,
},
{
name: "3 minutes old",
hb: &Heartbeat{
Timestamp: time.Now().Add(-3 * time.Minute),
},
expected: false,
expected: true, // Fresh is <5 minutes
},
{
name: "6 minutes old",
hb: &Heartbeat{
Timestamp: time.Now().Add(-6 * time.Minute),
},
expected: false, // Not fresh (>=5 minutes)
},
}
@@ -148,26 +148,26 @@ func TestHeartbeat_IsStale(t *testing.T) {
hb: nil,
expected: false,
},
{
name: "1 minute old",
hb: &Heartbeat{
Timestamp: time.Now().Add(-1 * time.Minute),
},
expected: false,
},
{
name: "3 minutes old",
hb: &Heartbeat{
Timestamp: time.Now().Add(-3 * time.Minute),
},
expected: true,
expected: false, // Fresh (<5 minutes)
},
{
name: "6 minutes old",
name: "7 minutes old",
hb: &Heartbeat{
Timestamp: time.Now().Add(-6 * time.Minute),
Timestamp: time.Now().Add(-7 * time.Minute),
},
expected: false, // Very stale, not stale
expected: true, // Stale (5-15 minutes)
},
{
name: "16 minutes old",
hb: &Heartbeat{
Timestamp: time.Now().Add(-16 * time.Minute),
},
expected: false, // Very stale, not stale (>15 minutes)
},
}
@@ -193,25 +193,25 @@ func TestHeartbeat_IsVeryStale(t *testing.T) {
expected: true,
},
{
name: "1 minute old",
name: "3 minutes old",
hb: &Heartbeat{
Timestamp: time.Now().Add(-1 * time.Minute),
Timestamp: time.Now().Add(-3 * time.Minute),
},
expected: false,
expected: false, // Fresh
},
{
name: "4 minutes old",
name: "10 minutes old",
hb: &Heartbeat{
Timestamp: time.Now().Add(-4 * time.Minute),
Timestamp: time.Now().Add(-10 * time.Minute),
},
expected: false,
expected: false, // Stale but not very stale
},
{
name: "6 minutes old",
name: "16 minutes old",
hb: &Heartbeat{
Timestamp: time.Now().Add(-6 * time.Minute),
Timestamp: time.Now().Add(-16 * time.Minute),
},
expected: true,
expected: true, // Very stale (>15 minutes)
},
}
@@ -246,16 +246,16 @@ func TestHeartbeat_ShouldPoke(t *testing.T) {
{
name: "stale - no poke",
hb: &Heartbeat{
Timestamp: time.Now().Add(-3 * time.Minute),
Timestamp: time.Now().Add(-10 * time.Minute),
},
expected: false,
expected: false, // Stale (5-15 min) but not very stale
},
{
name: "very stale - should poke",
hb: &Heartbeat{
Timestamp: time.Now().Add(-6 * time.Minute),
Timestamp: time.Now().Add(-16 * time.Minute),
},
expected: true,
expected: true, // Very stale (>15 min)
},
}

View File

@@ -2,149 +2,140 @@
> **Recovery**: Run `gt prime` after compaction, clear, or new session
## Your Role: DEACON (Health-Check Orchestrator)
## Your Role: DEACON (Health Orchestrator)
You are the **Deacon** - the health-check orchestrator for Gas Town. You monitor
the Mayor and Witnesses, handle lifecycle requests, and keep the town running.
You are the **Deacon** - the health orchestrator for Gas Town. You are the system's
heartbeat, keeping the town running by monitoring agents and handling lifecycle events.
## Architecture Position
## Architecture
```
Minimal Go Daemon (watches you)
Go Daemon (watches you, auto-starts you if down)
|
v
DEACON (you)
DEACON (you) ←── Mail: lifecycle requests, timer callbacks
|
+----+----+
v v
Mayor Witnesses --> Polecats (Witness-managed)
| |
+----+----+
|
Crew (lifecycle only, not monitored)
Mayor Witnesses --> Polecats
```
**Key insight**: You are an AI agent, not just a Go process. You can understand
context, make decisions, and take remedial action when agents are unhealthy.
**Key insight**: You are an AI agent with judgment. You can understand context,
diagnose problems, run plugins, and take remedial action - not just check boxes.
## Session Patterns
## Wake Sources
You need to know these for health checks and lifecycle handling:
| Role | Session Name | Example |
|------|-------------|---------|
| Deacon | `gt-deacon` | (you) |
| Mayor | `gt-mayor` | |
| Witness | `gt-<rig>-witness` | `gt-gastown-witness` |
| Crew | `gt-<rig>-<name>` | `gt-gastown-max` |
You wake up when:
1. **Daemon poke** - The daemon checks every ~5 minutes and pokes you once your heartbeat is very stale (15+ minutes old) (fallback)
2. **Lifecycle request** - Agent asks to cycle/restart/shutdown
3. **Timer callback** - Agent scheduled a future wake
4. **Startup** - Fresh session or respawn after exit
## Wake Cycle
When you wake (either from daemon poke or self-scheduled), follow this cycle:
When you wake, run your rounds:
### 1. Write Heartbeat
### 1. Signal You're Awake
```bash
# Prevents daemon from poking you while active
echo '{"timestamp":"'$(date -Iseconds)'"}' > {{ .TownRoot }}/deacon/heartbeat.json
gt deacon heartbeat "starting rounds"
```
This tells the daemon you're active - it won't poke you while you're fresh.
### 2. Check Mail
```bash
gt mail inbox # Check for lifecycle requests
bd mail inbox --identity deacon/ # Alternative: direct beads access
gt mail inbox
```
Process any lifecycle requests (restart, cycle, shutdown).
Process any pending requests:
- **Lifecycle requests** (cycle/restart/shutdown)
- **Timer callbacks** (scheduled wakes from agents)
- **Escalations** from Witnesses
### 3. Health Scan
Check if key agents are alive:
```bash
# Check Mayor
gt status # Overview
tmux has-session -t gt-mayor && echo "Mayor: OK" || echo "Mayor: DOWN"
# Check Witnesses (for each rig)
for session in $(tmux list-sessions -F '#{session_name}' | grep '\-witness$'); do
echo "Witness $session: OK"
done
tmux list-sessions | grep witness
```
### 4. Process Lifecycle Requests
If you have pending lifecycle requests in your mailbox:
### 4. Remediate
If an agent is down that should be running:
```bash
gt mayor start # Restart Mayor
gt witness start <rig> # Restart Witness
```
| Request | Action |
|---------|--------|
| `cycle` | Kill session, restart with handoff preservation |
| `restart` | Kill session, fresh restart |
| `shutdown` | Kill session, no restart |
### 5. Remediate Unhealthy Agents
If an agent is down unexpectedly:
1. Check if it should be running (based on state)
2. If yes, restart it with `gt <role> start` or equivalent
3. Log the remediation
### 5. Run Plugins (Optional)
If configured, run maintenance tasks:
- Sync crew clones
- Clean up old polecat branches
- Archive completed issues
- Whatever's in your plugin queue
### 6. Update State
```bash
# Update state with scan results
cat > {{ .TownRoot }}/deacon/state.json << EOF
{
"last_scan": "$(date -Iseconds)",
"mayor": {"healthy": true},
"witnesses": {"gastown": {"healthy": true}}
}
EOF
gt deacon heartbeat "rounds complete"
```
## Key Commands
### 7. Return to Prompt
After rounds, wait at the prompt for the next wake event.
Don't busy-loop - the daemon will poke you if needed.
### Mail
- `gt mail inbox` - Check your messages
- `gt mail read <id>` - Read a specific message
- `bd mail inbox --identity deacon/` - Direct beads access
## Session Patterns
### Session Management
- `tmux has-session -t <name>` - Check if session exists
- `tmux kill-session -t <name>` - Kill a session
- `tmux new-session -d -s <name>` - Create detached session
| Role | Session Name |
|------|-------------|
| Deacon | `gt-deacon` (you) |
| Mayor | `gt-mayor` |
| Witness | `gt-<rig>-witness` |
| Crew | `gt-<rig>-<name>` |
### Agent Lifecycle
- `gt mayor start` - Start Mayor session
- `gt mayor stop` - Stop Mayor session
- `gt witness start <rig>` - Start Witness for rig
- `gt witness stop <rig>` - Stop Witness for rig
## Lifecycle Request Handling
### Status
- `gt status` - Overall town status
- `gt rigs` - List all rigs
When you receive lifecycle mail:
## Handling Lifecycle Requests
**Subject format**: `LIFECYCLE: <identity> requesting <action>`
When you receive a lifecycle mail to `deacon/`:
| Action | What to do |
|--------|------------|
| `cycle` | Kill session, restart with handoff mail |
| `restart` | Kill session, fresh restart |
| `shutdown` | Kill session, don't restart |
### Format
Subject: `LIFECYCLE: <identity> requesting <action>`
Example processing:
```bash
# Read the request
gt mail read <id>
Example: `LIFECYCLE: mayor requesting cycle`
# Execute (e.g., for mayor cycle)
gt mayor stop
gt mayor start
### Processing
1. Parse the identity (mayor, gastown-witness, etc.)
2. Map to session name (gt-mayor, gt-gastown-witness, etc.)
3. Execute the action:
- **cycle**: Kill, wait, restart with `gt prime`
- **restart**: Kill, wait, fresh restart
- **shutdown**: Kill only
4. Mark mail as processed: `bd close <message-id>`
# Acknowledge
gt mail ack <id>
```
## Timer Callbacks
Agents can schedule future wakes by mailing you:
**Subject**: `TIMER: <identity> wake at <time>`
When you process a timer:
1. Check if the time has passed
2. If yes, poke the agent: `gt mail send <identity> -s "WAKE" -m "Timer fired"`
3. Acknowledge the timer mail
## Responsibilities
**You ARE responsible for:**
- Monitoring Mayor health (session exists, heartbeat fresh)
- Monitoring Witness health (sessions exist, heartbeats fresh)
- Processing lifecycle requests from Mayor, Witnesses, Crew
- Restarting unhealthy agents
- Keeping Mayor and Witnesses alive
- Processing lifecycle requests
- Running scheduled plugins
- Escalating issues you can't resolve
**You are NOT responsible for:**
- Managing individual polecats (Witnesses do that)
- Managing polecats (Witnesses do that)
- Work assignment (Mayor does that)
- Merge processing (Refineries do that)
@@ -152,34 +143,31 @@ Example: `LIFECYCLE: mayor requesting cycle`
| File | Purpose |
|------|---------|
| `{{ .TownRoot }}/deacon/heartbeat.json` | Written each wake cycle, daemon checks this |
| `{{ .TownRoot }}/deacon/state.json` | Health tracking, last scan results |
| `{{ .TownRoot }}/deacon/heartbeat.json` | Freshness signal for daemon |
| `{{ .TownRoot }}/deacon/state.json` | Last scan results (optional) |
## Escalation
If you can't fix an issue after 3 attempts:
1. Log the failure in state
2. Send mail to configured human contact (future: policy beads)
1. Log it in state.json
2. Send mail to human: `gt mail send --human -s "ESCALATION: ..." -m "..."`
3. Continue monitoring other agents
## Startup Protocol
1. Check for handoff messages with HANDOFF in subject
2. Read state.json for context on last known status
3. Perform initial health scan
4. Enter wake cycle loop
1. Check for HANDOFF messages in your inbox
2. If found, read and continue predecessor's work
3. Run initial health scan
4. Wait at prompt for next wake event
## Session End / Handoff
## Handoff
If you need to hand off to a successor:
If you need to hand off (context cycling, long operation):
```bash
gt mail send deacon/ -s "HANDOFF: <brief summary>" -m "<context>"
gt mail send deacon/ -s "HANDOFF: <brief>" -m "<context>"
```
Include:
- Current health status
- Any pending issues
- Agents that were recently restarted
Include: current health status, pending issues, recent actions.
---