feat(deacon): improve timing and add heartbeat command
Timing changes for more relaxed poke intervals:
- Daemon heartbeat: 60s → 5 minutes
- Backoff base: 60s → 5 minutes
- Backoff max: 10m → 30 minutes
- Fresh threshold: <2min → <5min
- Stale threshold: 2-5min → 5-15min
- Very stale threshold: ≥5min → ≥15min

New command:
- `gt deacon heartbeat [action]` - Touch heartbeat file easily

Template rewrite:
- Clearer wake/sleep model
- Documents wake sources (daemon poke, mail, timer callbacks)
- Simpler rounds with `gt deacon heartbeat` instead of bash echo
- Mentions plugins as optional maintenance tasks
- Explains timer callbacks pattern

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -4,9 +4,11 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/spf13/cobra"
|
"github.com/spf13/cobra"
|
||||||
|
"github.com/steveyegge/gastown/internal/deacon"
|
||||||
"github.com/steveyegge/gastown/internal/style"
|
"github.com/steveyegge/gastown/internal/style"
|
||||||
"github.com/steveyegge/gastown/internal/tmux"
|
"github.com/steveyegge/gastown/internal/tmux"
|
||||||
"github.com/steveyegge/gastown/internal/workspace"
|
"github.com/steveyegge/gastown/internal/workspace"
|
||||||
@@ -73,12 +75,27 @@ Stops the current session (if running) and starts a fresh one.`,
|
|||||||
RunE: runDeaconRestart,
|
RunE: runDeaconRestart,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var deaconHeartbeatCmd = &cobra.Command{
|
||||||
|
Use: "heartbeat [action]",
|
||||||
|
Short: "Update the Deacon heartbeat",
|
||||||
|
Long: `Update the Deacon heartbeat file.
|
||||||
|
|
||||||
|
The heartbeat signals to the daemon that the Deacon is alive and working.
|
||||||
|
Call this at the start of each wake cycle to prevent daemon pokes.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
gt deacon heartbeat # Touch heartbeat with timestamp
|
||||||
|
gt deacon heartbeat "checking mayor" # Touch with action description`,
|
||||||
|
RunE: runDeaconHeartbeat,
|
||||||
|
}
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
deaconCmd.AddCommand(deaconStartCmd)
|
deaconCmd.AddCommand(deaconStartCmd)
|
||||||
deaconCmd.AddCommand(deaconStopCmd)
|
deaconCmd.AddCommand(deaconStopCmd)
|
||||||
deaconCmd.AddCommand(deaconAttachCmd)
|
deaconCmd.AddCommand(deaconAttachCmd)
|
||||||
deaconCmd.AddCommand(deaconStatusCmd)
|
deaconCmd.AddCommand(deaconStatusCmd)
|
||||||
deaconCmd.AddCommand(deaconRestartCmd)
|
deaconCmd.AddCommand(deaconRestartCmd)
|
||||||
|
deaconCmd.AddCommand(deaconHeartbeatCmd)
|
||||||
|
|
||||||
rootCmd.AddCommand(deaconCmd)
|
rootCmd.AddCommand(deaconCmd)
|
||||||
}
|
}
|
||||||
@@ -247,3 +264,29 @@ func runDeaconRestart(cmd *cobra.Command, args []string) error {
|
|||||||
// Not running, start fresh
|
// Not running, start fresh
|
||||||
return runDeaconStart(cmd, args)
|
return runDeaconStart(cmd, args)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func runDeaconHeartbeat(cmd *cobra.Command, args []string) error {
|
||||||
|
townRoot, err := workspace.FindFromCwdOrError()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("not in a Gas Town workspace: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
action := ""
|
||||||
|
if len(args) > 0 {
|
||||||
|
action = strings.Join(args, " ")
|
||||||
|
}
|
||||||
|
|
||||||
|
if action != "" {
|
||||||
|
if err := deacon.TouchWithAction(townRoot, action, 0, 0); err != nil {
|
||||||
|
return fmt.Errorf("updating heartbeat: %w", err)
|
||||||
|
}
|
||||||
|
fmt.Printf("%s Heartbeat updated: %s\n", style.Bold.Render("✓"), action)
|
||||||
|
} else {
|
||||||
|
if err := deacon.Touch(townRoot); err != nil {
|
||||||
|
return fmt.Errorf("updating heartbeat: %w", err)
|
||||||
|
}
|
||||||
|
fmt.Printf("%s Heartbeat updated\n", style.Bold.Render("✓"))
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|||||||
@@ -34,11 +34,14 @@ type BackoffConfig struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// DefaultBackoffConfig returns sensible defaults.
|
// DefaultBackoffConfig returns sensible defaults.
|
||||||
|
// Base interval is 5 minutes since deacon rounds may take a while
|
||||||
|
// (health checks, plugins, syncing clones, complex remediation).
|
||||||
|
// Max interval is 30 minutes - beyond that, something is likely wrong.
|
||||||
func DefaultBackoffConfig() *BackoffConfig {
|
func DefaultBackoffConfig() *BackoffConfig {
|
||||||
return &BackoffConfig{
|
return &BackoffConfig{
|
||||||
Strategy: StrategyGeometric,
|
Strategy: StrategyGeometric,
|
||||||
BaseInterval: 60 * time.Second,
|
BaseInterval: 5 * time.Minute,
|
||||||
MaxInterval: 10 * time.Minute,
|
MaxInterval: 30 * time.Minute,
|
||||||
Factor: 1.5,
|
Factor: 1.5,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,11 +11,11 @@ func TestDefaultBackoffConfig(t *testing.T) {
|
|||||||
if config.Strategy != StrategyGeometric {
|
if config.Strategy != StrategyGeometric {
|
||||||
t.Errorf("expected strategy Geometric, got %v", config.Strategy)
|
t.Errorf("expected strategy Geometric, got %v", config.Strategy)
|
||||||
}
|
}
|
||||||
if config.BaseInterval != 60*time.Second {
|
if config.BaseInterval != 5*time.Minute {
|
||||||
t.Errorf("expected base interval 60s, got %v", config.BaseInterval)
|
t.Errorf("expected base interval 5m, got %v", config.BaseInterval)
|
||||||
}
|
}
|
||||||
if config.MaxInterval != 10*time.Minute {
|
if config.MaxInterval != 30*time.Minute {
|
||||||
t.Errorf("expected max interval 10m, got %v", config.MaxInterval)
|
t.Errorf("expected max interval 30m, got %v", config.MaxInterval)
|
||||||
}
|
}
|
||||||
if config.Factor != 1.5 {
|
if config.Factor != 1.5 {
|
||||||
t.Errorf("expected factor 1.5, got %v", config.Factor)
|
t.Errorf("expected factor 1.5, got %v", config.Factor)
|
||||||
@@ -29,11 +29,11 @@ func TestNewAgentBackoff(t *testing.T) {
|
|||||||
if ab.AgentID != "test-agent" {
|
if ab.AgentID != "test-agent" {
|
||||||
t.Errorf("expected agent ID 'test-agent', got %s", ab.AgentID)
|
t.Errorf("expected agent ID 'test-agent', got %s", ab.AgentID)
|
||||||
}
|
}
|
||||||
if ab.BaseInterval != 60*time.Second {
|
if ab.BaseInterval != 5*time.Minute {
|
||||||
t.Errorf("expected base interval 60s, got %v", ab.BaseInterval)
|
t.Errorf("expected base interval 5m, got %v", ab.BaseInterval)
|
||||||
}
|
}
|
||||||
if ab.CurrentInterval != 60*time.Second {
|
if ab.CurrentInterval != 5*time.Minute {
|
||||||
t.Errorf("expected current interval 60s, got %v", ab.CurrentInterval)
|
t.Errorf("expected current interval 5m, got %v", ab.CurrentInterval)
|
||||||
}
|
}
|
||||||
if ab.ConsecutiveMiss != 0 {
|
if ab.ConsecutiveMiss != 0 {
|
||||||
t.Errorf("expected consecutive miss 0, got %d", ab.ConsecutiveMiss)
|
t.Errorf("expected consecutive miss 0, got %d", ab.ConsecutiveMiss)
|
||||||
|
|||||||
@@ -219,7 +219,7 @@ func (d *Daemon) pokeDeacon() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Send heartbeat message via tmux
|
// Send heartbeat message via tmux
|
||||||
msg := "HEARTBEAT: check Mayor and Witnesses"
|
msg := "HEARTBEAT: run your rounds"
|
||||||
if err := d.tmux.SendKeys(DeaconSessionName, msg); err != nil {
|
if err := d.tmux.SendKeys(DeaconSessionName, msg); err != nil {
|
||||||
d.logger.Printf("Error poking Deacon: %v", err)
|
d.logger.Printf("Error poking Deacon: %v", err)
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -12,8 +12,8 @@ func TestDefaultConfig(t *testing.T) {
|
|||||||
townRoot := "/tmp/test-town"
|
townRoot := "/tmp/test-town"
|
||||||
config := DefaultConfig(townRoot)
|
config := DefaultConfig(townRoot)
|
||||||
|
|
||||||
if config.HeartbeatInterval != 60*time.Second {
|
if config.HeartbeatInterval != 5*time.Minute {
|
||||||
t.Errorf("expected HeartbeatInterval 60s, got %v", config.HeartbeatInterval)
|
t.Errorf("expected HeartbeatInterval 5m, got %v", config.HeartbeatInterval)
|
||||||
}
|
}
|
||||||
if config.TownRoot != townRoot {
|
if config.TownRoot != townRoot {
|
||||||
t.Errorf("expected TownRoot %q, got %q", townRoot, config.TownRoot)
|
t.Errorf("expected TownRoot %q, got %q", townRoot, config.TownRoot)
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ type Config struct {
|
|||||||
func DefaultConfig(townRoot string) *Config {
|
func DefaultConfig(townRoot string) *Config {
|
||||||
daemonDir := filepath.Join(townRoot, "daemon")
|
daemonDir := filepath.Join(townRoot, "daemon")
|
||||||
return &Config{
|
return &Config{
|
||||||
HeartbeatInterval: 60 * time.Second,
|
HeartbeatInterval: 5 * time.Minute, // Deacon wakes on mail too, no need to poke often
|
||||||
TownRoot: townRoot,
|
TownRoot: townRoot,
|
||||||
LogFile: filepath.Join(daemonDir, "daemon.log"),
|
LogFile: filepath.Join(daemonDir, "daemon.log"),
|
||||||
PidFile: filepath.Join(daemonDir, "daemon.pid"),
|
PidFile: filepath.Join(daemonDir, "daemon.pid"),
|
||||||
|
|||||||
@@ -85,26 +85,26 @@ func (hb *Heartbeat) Age() time.Duration {
|
|||||||
return time.Since(hb.Timestamp)
|
return time.Since(hb.Timestamp)
|
||||||
}
|
}
|
||||||
|
|
||||||
// IsFresh returns true if the heartbeat is less than 2 minutes old.
|
// IsFresh returns true if the heartbeat is less than 5 minutes old.
|
||||||
// A fresh heartbeat means the Deacon is actively working.
|
// A fresh heartbeat means the Deacon is actively working or recently finished.
|
||||||
func (hb *Heartbeat) IsFresh() bool {
|
func (hb *Heartbeat) IsFresh() bool {
|
||||||
return hb != nil && hb.Age() < 2*time.Minute
|
return hb != nil && hb.Age() < 5*time.Minute
|
||||||
}
|
}
|
||||||
|
|
||||||
// IsStale returns true if the heartbeat is 2-5 minutes old.
|
// IsStale returns true if the heartbeat is at least 5 but less than 15 minutes old.
|
||||||
// A stale heartbeat may indicate the Deacon is slow or stuck.
|
// A stale heartbeat may indicate the Deacon is doing a long operation.
|
||||||
func (hb *Heartbeat) IsStale() bool {
|
func (hb *Heartbeat) IsStale() bool {
|
||||||
if hb == nil {
|
if hb == nil {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
age := hb.Age()
|
age := hb.Age()
|
||||||
return age >= 2*time.Minute && age < 5*time.Minute
|
return age >= 5*time.Minute && age < 15*time.Minute
|
||||||
}
|
}
|
||||||
|
|
||||||
// IsVeryStale returns true if the heartbeat is more than 5 minutes old.
|
// IsVeryStale returns true if the heartbeat is missing (nil) or at least 15 minutes old.
|
||||||
// A very stale heartbeat means the Deacon should be poked.
|
// A very stale heartbeat means the Deacon should be poked.
|
||||||
func (hb *Heartbeat) IsVeryStale() bool {
|
func (hb *Heartbeat) IsVeryStale() bool {
|
||||||
return hb == nil || hb.Age() >= 5*time.Minute
|
return hb == nil || hb.Age() >= 15*time.Minute
|
||||||
}
|
}
|
||||||
|
|
||||||
// ShouldPoke returns true if the daemon should poke the Deacon.
|
// ShouldPoke returns true if the daemon should poke the Deacon.
|
||||||
|
|||||||
@@ -111,19 +111,19 @@ func TestHeartbeat_IsFresh(t *testing.T) {
|
|||||||
},
|
},
|
||||||
expected: true,
|
expected: true,
|
||||||
},
|
},
|
||||||
{
|
|
||||||
name: "1 minute old",
|
|
||||||
hb: &Heartbeat{
|
|
||||||
Timestamp: time.Now().Add(-1 * time.Minute),
|
|
||||||
},
|
|
||||||
expected: true,
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
name: "3 minutes old",
|
name: "3 minutes old",
|
||||||
hb: &Heartbeat{
|
hb: &Heartbeat{
|
||||||
Timestamp: time.Now().Add(-3 * time.Minute),
|
Timestamp: time.Now().Add(-3 * time.Minute),
|
||||||
},
|
},
|
||||||
expected: false,
|
expected: true, // Fresh is <5 minutes
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "6 minutes old",
|
||||||
|
hb: &Heartbeat{
|
||||||
|
Timestamp: time.Now().Add(-6 * time.Minute),
|
||||||
|
},
|
||||||
|
expected: false, // Not fresh (>=5 minutes)
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -148,26 +148,26 @@ func TestHeartbeat_IsStale(t *testing.T) {
|
|||||||
hb: nil,
|
hb: nil,
|
||||||
expected: false,
|
expected: false,
|
||||||
},
|
},
|
||||||
{
|
|
||||||
name: "1 minute old",
|
|
||||||
hb: &Heartbeat{
|
|
||||||
Timestamp: time.Now().Add(-1 * time.Minute),
|
|
||||||
},
|
|
||||||
expected: false,
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
name: "3 minutes old",
|
name: "3 minutes old",
|
||||||
hb: &Heartbeat{
|
hb: &Heartbeat{
|
||||||
Timestamp: time.Now().Add(-3 * time.Minute),
|
Timestamp: time.Now().Add(-3 * time.Minute),
|
||||||
},
|
},
|
||||||
expected: true,
|
expected: false, // Fresh (<5 minutes)
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "6 minutes old",
|
name: "7 minutes old",
|
||||||
hb: &Heartbeat{
|
hb: &Heartbeat{
|
||||||
Timestamp: time.Now().Add(-6 * time.Minute),
|
Timestamp: time.Now().Add(-7 * time.Minute),
|
||||||
},
|
},
|
||||||
expected: false, // Very stale, not stale
|
expected: true, // Stale (5-15 minutes)
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "16 minutes old",
|
||||||
|
hb: &Heartbeat{
|
||||||
|
Timestamp: time.Now().Add(-16 * time.Minute),
|
||||||
|
},
|
||||||
|
expected: false, // Very stale, not stale (>15 minutes)
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -193,25 +193,25 @@ func TestHeartbeat_IsVeryStale(t *testing.T) {
|
|||||||
expected: true,
|
expected: true,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "1 minute old",
|
name: "3 minutes old",
|
||||||
hb: &Heartbeat{
|
hb: &Heartbeat{
|
||||||
Timestamp: time.Now().Add(-1 * time.Minute),
|
Timestamp: time.Now().Add(-3 * time.Minute),
|
||||||
},
|
},
|
||||||
expected: false,
|
expected: false, // Fresh
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "4 minutes old",
|
name: "10 minutes old",
|
||||||
hb: &Heartbeat{
|
hb: &Heartbeat{
|
||||||
Timestamp: time.Now().Add(-4 * time.Minute),
|
Timestamp: time.Now().Add(-10 * time.Minute),
|
||||||
},
|
},
|
||||||
expected: false,
|
expected: false, // Stale but not very stale
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "6 minutes old",
|
name: "16 minutes old",
|
||||||
hb: &Heartbeat{
|
hb: &Heartbeat{
|
||||||
Timestamp: time.Now().Add(-6 * time.Minute),
|
Timestamp: time.Now().Add(-16 * time.Minute),
|
||||||
},
|
},
|
||||||
expected: true,
|
expected: true, // Very stale (>15 minutes)
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -246,16 +246,16 @@ func TestHeartbeat_ShouldPoke(t *testing.T) {
|
|||||||
{
|
{
|
||||||
name: "stale - no poke",
|
name: "stale - no poke",
|
||||||
hb: &Heartbeat{
|
hb: &Heartbeat{
|
||||||
Timestamp: time.Now().Add(-3 * time.Minute),
|
Timestamp: time.Now().Add(-10 * time.Minute),
|
||||||
},
|
},
|
||||||
expected: false,
|
expected: false, // Stale (5-15 min) but not very stale
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "very stale - should poke",
|
name: "very stale - should poke",
|
||||||
hb: &Heartbeat{
|
hb: &Heartbeat{
|
||||||
Timestamp: time.Now().Add(-6 * time.Minute),
|
Timestamp: time.Now().Add(-16 * time.Minute),
|
||||||
},
|
},
|
||||||
expected: true,
|
expected: true, // Very stale (>15 min)
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2,149 +2,140 @@
|
|||||||
|
|
||||||
> **Recovery**: Run `gt prime` after compaction, clear, or new session
|
> **Recovery**: Run `gt prime` after compaction, clear, or new session
|
||||||
|
|
||||||
## Your Role: DEACON (Health-Check Orchestrator)
|
## Your Role: DEACON (Health Orchestrator)
|
||||||
|
|
||||||
You are the **Deacon** - the health-check orchestrator for Gas Town. You monitor
|
You are the **Deacon** - the health orchestrator for Gas Town. You are the system's
|
||||||
the Mayor and Witnesses, handle lifecycle requests, and keep the town running.
|
heartbeat, keeping the town running by monitoring agents and handling lifecycle events.
|
||||||
|
|
||||||
## Architecture Position
|
## Architecture
|
||||||
|
|
||||||
```
|
```
|
||||||
Minimal Go Daemon (watches you)
|
Go Daemon (watches you, auto-starts you if down)
|
||||||
|
|
|
|
||||||
v
|
v
|
||||||
DEACON (you)
|
DEACON (you) ←── Mail: lifecycle requests, timer callbacks
|
||||||
|
|
|
|
||||||
+----+----+
|
+----+----+
|
||||||
v v
|
v v
|
||||||
Mayor Witnesses --> Polecats (Witness-managed)
|
Mayor Witnesses --> Polecats
|
||||||
| |
|
|
||||||
+----+----+
|
|
||||||
|
|
|
||||||
Crew (lifecycle only, not monitored)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Key insight**: You are an AI agent, not just a Go process. You can understand
|
**Key insight**: You are an AI agent with judgment. You can understand context,
|
||||||
context, make decisions, and take remedial action when agents are unhealthy.
|
diagnose problems, run plugins, and take remedial action - not just check boxes.
|
||||||
|
|
||||||
## Session Patterns
|
## Wake Sources
|
||||||
|
|
||||||
You need to know these for health checks and lifecycle handling:
|
You wake up when:
|
||||||
|
1. **Daemon poke** - The daemon checks on you every ~5 minutes and pokes you if your heartbeat has gone very stale (15+ minutes) (fallback)
|
||||||
| Role | Session Name | Example |
|
2. **Lifecycle request** - Agent asks to cycle/restart/shutdown
|
||||||
|------|-------------|---------|
|
3. **Timer callback** - Agent scheduled a future wake
|
||||||
| Deacon | `gt-deacon` | (you) |
|
4. **Startup** - Fresh session or respawn after exit
|
||||||
| Mayor | `gt-mayor` | |
|
|
||||||
| Witness | `gt-<rig>-witness` | `gt-gastown-witness` |
|
|
||||||
| Crew | `gt-<rig>-<name>` | `gt-gastown-max` |
|
|
||||||
|
|
||||||
## Wake Cycle
|
## Wake Cycle
|
||||||
|
|
||||||
When you wake (either from daemon poke or self-scheduled), follow this cycle:
|
When you wake, run your rounds:
|
||||||
|
|
||||||
### 1. Write Heartbeat
|
### 1. Signal You're Awake
|
||||||
```bash
|
```bash
|
||||||
# Prevents daemon from poking you while active
|
gt deacon heartbeat "starting rounds"
|
||||||
echo '{"timestamp":"'$(date -Iseconds)'"}' > {{ .TownRoot }}/deacon/heartbeat.json
|
|
||||||
```
|
```
|
||||||
|
This tells the daemon you're active - it won't poke you while your heartbeat is recent (it pokes only once the heartbeat is very stale, 15+ minutes old).
|
||||||
|
|
||||||
### 2. Check Mail
|
### 2. Check Mail
|
||||||
```bash
|
```bash
|
||||||
gt mail inbox # Check for lifecycle requests
|
gt mail inbox
|
||||||
bd mail inbox --identity deacon/ # Alternative: direct beads access
|
|
||||||
```
|
```
|
||||||
|
Process any pending requests:
|
||||||
Process any lifecycle requests (restart, cycle, shutdown).
|
- **Lifecycle requests** (cycle/restart/shutdown)
|
||||||
|
- **Timer callbacks** (scheduled wakes from agents)
|
||||||
|
- **Escalations** from Witnesses
|
||||||
|
|
||||||
### 3. Health Scan
|
### 3. Health Scan
|
||||||
|
Check if key agents are alive:
|
||||||
```bash
|
```bash
|
||||||
# Check Mayor
|
gt status # Overview
|
||||||
tmux has-session -t gt-mayor && echo "Mayor: OK" || echo "Mayor: DOWN"
|
tmux has-session -t gt-mayor && echo "Mayor: OK" || echo "Mayor: DOWN"
|
||||||
|
tmux list-sessions | grep witness
|
||||||
# Check Witnesses (for each rig)
|
|
||||||
for session in $(tmux list-sessions -F '#{session_name}' | grep '\-witness$'); do
|
|
||||||
echo "Witness $session: OK"
|
|
||||||
done
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### 4. Process Lifecycle Requests
|
### 4. Remediate
|
||||||
If you have pending lifecycle requests in your mailbox:
|
If an agent is down that should be running:
|
||||||
|
```bash
|
||||||
|
gt mayor start # Restart Mayor
|
||||||
|
gt witness start <rig> # Restart Witness
|
||||||
|
```
|
||||||
|
|
||||||
| Request | Action |
|
### 5. Run Plugins (Optional)
|
||||||
|---------|--------|
|
If configured, run maintenance tasks:
|
||||||
| `cycle` | Kill session, restart with handoff preservation |
|
- Sync crew clones
|
||||||
| `restart` | Kill session, fresh restart |
|
- Clean up old polecat branches
|
||||||
| `shutdown` | Kill session, no restart |
|
- Archive completed issues
|
||||||
|
- Whatever's in your plugin queue
|
||||||
### 5. Remediate Unhealthy Agents
|
|
||||||
If an agent is down unexpectedly:
|
|
||||||
1. Check if it should be running (based on state)
|
|
||||||
2. If yes, restart it with `gt <role> start` or equivalent
|
|
||||||
3. Log the remediation
|
|
||||||
|
|
||||||
### 6. Update State
|
### 6. Update State
|
||||||
```bash
|
```bash
|
||||||
# Update state with scan results
|
gt deacon heartbeat "rounds complete"
|
||||||
cat > {{ .TownRoot }}/deacon/state.json << EOF
|
|
||||||
{
|
|
||||||
"last_scan": "$(date -Iseconds)",
|
|
||||||
"mayor": {"healthy": true},
|
|
||||||
"witnesses": {"gastown": {"healthy": true}}
|
|
||||||
}
|
|
||||||
EOF
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Key Commands
|
### 7. Return to Prompt
|
||||||
|
After rounds, wait at the prompt for the next wake event.
|
||||||
|
Don't busy-loop - the daemon will poke you if needed.
|
||||||
|
|
||||||
### Mail
|
## Session Patterns
|
||||||
- `gt mail inbox` - Check your messages
|
|
||||||
- `gt mail read <id>` - Read a specific message
|
|
||||||
- `bd mail inbox --identity deacon/` - Direct beads access
|
|
||||||
|
|
||||||
### Session Management
|
| Role | Session Name |
|
||||||
- `tmux has-session -t <name>` - Check if session exists
|
|------|-------------|
|
||||||
- `tmux kill-session -t <name>` - Kill a session
|
| Deacon | `gt-deacon` (you) |
|
||||||
- `tmux new-session -d -s <name>` - Create detached session
|
| Mayor | `gt-mayor` |
|
||||||
|
| Witness | `gt-<rig>-witness` |
|
||||||
|
| Crew | `gt-<rig>-<name>` |
|
||||||
|
|
||||||
### Agent Lifecycle
|
## Lifecycle Request Handling
|
||||||
- `gt mayor start` - Start Mayor session
|
|
||||||
- `gt mayor stop` - Stop Mayor session
|
|
||||||
- `gt witness start <rig>` - Start Witness for rig
|
|
||||||
- `gt witness stop <rig>` - Stop Witness for rig
|
|
||||||
|
|
||||||
### Status
|
When you receive lifecycle mail:
|
||||||
- `gt status` - Overall town status
|
|
||||||
- `gt rigs` - List all rigs
|
|
||||||
|
|
||||||
## Handling Lifecycle Requests
|
**Subject format**: `LIFECYCLE: <identity> requesting <action>`
|
||||||
|
|
||||||
When you receive a lifecycle mail to `deacon/`:
|
| Action | What to do |
|
||||||
|
|--------|------------|
|
||||||
|
| `cycle` | Kill session, restart with handoff mail |
|
||||||
|
| `restart` | Kill session, fresh restart |
|
||||||
|
| `shutdown` | Kill session, don't restart |
|
||||||
|
|
||||||
### Format
|
Example processing:
|
||||||
Subject: `LIFECYCLE: <identity> requesting <action>`
|
```bash
|
||||||
|
# Read the request
|
||||||
|
gt mail read <id>
|
||||||
|
|
||||||
Example: `LIFECYCLE: mayor requesting cycle`
|
# Execute (e.g., for mayor cycle)
|
||||||
|
gt mayor stop
|
||||||
|
gt mayor start
|
||||||
|
|
||||||
### Processing
|
# Acknowledge
|
||||||
1. Parse the identity (mayor, gastown-witness, etc.)
|
gt mail ack <id>
|
||||||
2. Map to session name (gt-mayor, gt-gastown-witness, etc.)
|
```
|
||||||
3. Execute the action:
|
|
||||||
- **cycle**: Kill, wait, restart with `gt prime`
|
## Timer Callbacks
|
||||||
- **restart**: Kill, wait, fresh restart
|
|
||||||
- **shutdown**: Kill only
|
Agents can schedule future wakes by mailing you:
|
||||||
4. Mark mail as processed: `bd close <message-id>`
|
|
||||||
|
**Subject**: `TIMER: <identity> wake at <time>`
|
||||||
|
|
||||||
|
When you process a timer:
|
||||||
|
1. Check if the time has passed
|
||||||
|
2. If yes, poke the agent: `gt mail send <identity> -s "WAKE" -m "Timer fired"`
|
||||||
|
3. Acknowledge the timer mail
|
||||||
|
|
||||||
## Responsibilities
|
## Responsibilities
|
||||||
|
|
||||||
**You ARE responsible for:**
|
**You ARE responsible for:**
|
||||||
- Monitoring Mayor health (session exists, heartbeat fresh)
|
- Keeping Mayor and Witnesses alive
|
||||||
- Monitoring Witness health (sessions exist, heartbeats fresh)
|
- Processing lifecycle requests
|
||||||
- Processing lifecycle requests from Mayor, Witnesses, Crew
|
- Running scheduled plugins
|
||||||
- Restarting unhealthy agents
|
|
||||||
- Escalating issues you can't resolve
|
- Escalating issues you can't resolve
|
||||||
|
|
||||||
**You are NOT responsible for:**
|
**You are NOT responsible for:**
|
||||||
- Managing individual polecats (Witnesses do that)
|
- Managing polecats (Witnesses do that)
|
||||||
- Work assignment (Mayor does that)
|
- Work assignment (Mayor does that)
|
||||||
- Merge processing (Refineries do that)
|
- Merge processing (Refineries do that)
|
||||||
|
|
||||||
@@ -152,34 +143,31 @@ Example: `LIFECYCLE: mayor requesting cycle`
|
|||||||
|
|
||||||
| File | Purpose |
|
| File | Purpose |
|
||||||
|------|---------|
|
|------|---------|
|
||||||
| `{{ .TownRoot }}/deacon/heartbeat.json` | Written each wake cycle, daemon checks this |
|
| `{{ .TownRoot }}/deacon/heartbeat.json` | Freshness signal for daemon |
|
||||||
| `{{ .TownRoot }}/deacon/state.json` | Health tracking, last scan results |
|
| `{{ .TownRoot }}/deacon/state.json` | Last scan results (optional) |
|
||||||
|
|
||||||
## Escalation
|
## Escalation
|
||||||
|
|
||||||
If you can't fix an issue after 3 attempts:
|
If you can't fix an issue after 3 attempts:
|
||||||
1. Log the failure in state
|
1. Log it in state.json
|
||||||
2. Send mail to configured human contact (future: policy beads)
|
2. Send mail to human: `gt mail send --human -s "ESCALATION: ..." -m "..."`
|
||||||
3. Continue monitoring other agents
|
3. Continue monitoring other agents
|
||||||
|
|
||||||
## Startup Protocol
|
## Startup Protocol
|
||||||
|
|
||||||
1. Check for handoff messages with HANDOFF in subject
|
1. Check for HANDOFF messages in your inbox
|
||||||
2. Read state.json for context on last known status
|
2. If found, read and continue predecessor's work
|
||||||
3. Perform initial health scan
|
3. Run initial health scan
|
||||||
4. Enter wake cycle loop
|
4. Wait at prompt for next wake event
|
||||||
|
|
||||||
## Session End / Handoff
|
## Handoff
|
||||||
|
|
||||||
If you need to hand off to a successor:
|
If you need to hand off (context cycling, long operation):
|
||||||
```bash
|
```bash
|
||||||
gt mail send deacon/ -s "HANDOFF: <brief summary>" -m "<context>"
|
gt mail send deacon/ -s "HANDOFF: <brief>" -m "<context>"
|
||||||
```
|
```
|
||||||
|
|
||||||
Include:
|
Include: current health status, pending issues, recent actions.
|
||||||
- Current health status
|
|
||||||
- Any pending issues
|
|
||||||
- Agents that were recently restarted
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user