Implement Boot: daemon entry point dog for Deacon triage (gt-rwd5j)

Boot is a watchdog that the daemon pokes instead of Deacon directly,
centralizing the 'when to wake Deacon' decision in an agent that can
reason about context.

Key changes:
- Add internal/boot package with marker file and status tracking
- Add gt boot commands: status, spawn, triage
- Add mol-boot-triage formula for Boot's triage cycle
- Modify daemon to call ensureBootRunning instead of ensureDeaconRunning
- Add tmux.IsAvailable() for degraded mode detection
- Add boot.md.tmpl role template

Boot lifecycle:
1. Daemon tick spawns Boot (fresh each time)
2. Boot runs triage: observe, decide, act
3. Boot cleans stale handoffs from Deacon inbox
4. Boot exits (or hands off in non-degraded mode)

In degraded mode (no tmux), Boot runs mechanical triage directly.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Steve Yegge
2025-12-30 16:14:54 -08:00
parent 3099d99424
commit 2112804aba
6 changed files with 991 additions and 2 deletions

231
internal/boot/boot.go Normal file
View File

@@ -0,0 +1,231 @@
// Package boot manages the Boot watchdog - the daemon's entry point for Deacon triage.
// Boot is a dog that runs fresh on each daemon tick, deciding whether to wake/nudge/interrupt
// the Deacon or let it continue. This centralizes the "when to wake" decision in an agent.
package boot
import (
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"time"
"github.com/steveyegge/gastown/internal/tmux"
)
// Well-known names and timing used by the Boot watchdog.
const (
	// SessionName is the tmux session name for Boot.
	SessionName = "gt-deacon-boot"

	// MarkerFileName is the name of the file whose presence indicates that
	// Boot is currently running.
	MarkerFileName = ".boot-running"

	// StatusFileName is the name of the file that stores Boot's last
	// execution status.
	StatusFileName = ".boot-status.json"

	// DefaultMarkerTTL is how long a marker is considered valid before it
	// is treated as stale.
	DefaultMarkerTTL = 5 * time.Minute
)
// Status represents Boot's execution status.
// It is written as indented JSON by SaveStatus and read back by LoadStatus.
//
// NOTE(review): encoding/json's omitempty does not omit struct values, so the
// two time.Time fields are always emitted (as the zero time when unset).
type Status struct {
// Running is set to true when a run starts and to false when it completes.
Running bool `json:"running"`
// StartedAt is when the run began; the zero value means no run was recorded.
StartedAt time.Time `json:"started_at,omitempty"`
// CompletedAt is when the run finished; zero while a run has not completed.
CompletedAt time.Time `json:"completed_at,omitempty"`
LastAction string `json:"last_action,omitempty"` // action taken; the degraded paths write start/nothing/report/error
Target string `json:"target,omitempty"` // what the action applied to, e.g. "deacon"
// Error holds the error text of a failed run, empty on success.
Error string `json:"error,omitempty"`
}
// Boot manages the Boot watchdog lifecycle: the marker ("lock") file, the
// persisted status file, and spawning a Boot run.
type Boot struct {
// townRoot is the root directory passed to New.
townRoot string
bootDir string // <townRoot>/deacon/dogs/boot - holds the marker and status files
deaconDir string // <townRoot>/deacon - working directory for spawned Boot runs
// tmux is the tmux wrapper used for session checks and spawning.
tmux *tmux.Tmux
// degraded is true when GT_DEGRADED was "true" at the time New was called.
degraded bool
}
// New builds a Boot manager rooted at townRoot.
//
// The Deacon directory is <townRoot>/deacon and Boot's own directory is
// <townRoot>/deacon/dogs/boot. Degraded mode is enabled only when the
// GT_DEGRADED environment variable equals "true" at call time.
func New(townRoot string) *Boot {
	deaconDir := filepath.Join(townRoot, "deacon")

	mgr := new(Boot)
	mgr.townRoot = townRoot
	mgr.bootDir = filepath.Join(deaconDir, "dogs", "boot")
	mgr.deaconDir = deaconDir
	mgr.tmux = tmux.NewTmux()
	mgr.degraded = os.Getenv("GT_DEGRADED") == "true"
	return mgr
}
// EnsureDir ensures the Boot directory exists, creating it and any missing
// parent directories with mode 0755. It returns the os.MkdirAll error, if any.
func (b *Boot) EnsureDir() error {
return os.MkdirAll(b.bootDir, 0755)
}
// markerPath returns the path to the marker file (MarkerFileName inside the
// Boot directory).
func (b *Boot) markerPath() string {
return filepath.Join(b.bootDir, MarkerFileName)
}
// statusPath returns the path to the status file (StatusFileName inside the
// Boot directory).
func (b *Boot) statusPath() string {
return filepath.Join(b.bootDir, StatusFileName)
}
// IsRunning reports whether Boot is currently running, judged by the marker file.
//
// It returns true only when the marker can be stat'ed and its modification
// time is no older than DefaultMarkerTTL. A marker older than the TTL is
// treated as stale and removed (best effort) before returning false, so this
// call can modify the filesystem.
func (b *Boot) IsRunning() bool {
	marker := b.markerPath()

	fi, statErr := os.Stat(marker)
	if statErr != nil {
		return false
	}
	if time.Since(fi.ModTime()) <= DefaultMarkerTTL {
		return true
	}

	// Stale marker: clean it up; a failed removal is deliberately ignored.
	_ = os.Remove(marker)
	return false
}
// IsSessionAlive reports whether the Boot tmux session (SessionName) exists.
// Any error from the tmux lookup is treated as "not alive".
func (b *Boot) IsSessionAlive() bool {
	alive, lookupErr := b.tmux.HasSession(SessionName)
	if lookupErr != nil {
		return false
	}
	return alive
}
// AcquireLock creates the marker file to indicate Boot is starting.
//
// It returns an error if Boot is already running, if the Boot directory
// cannot be created, or if the marker cannot be created. The marker is
// created exclusively, so when two processes race only one of them acquires
// the lock; the other gets the "already running" error.
func (b *Boot) AcquireLock() error {
	// IsRunning also removes a marker older than DefaultMarkerTTL, so a stale
	// marker normally does not block the exclusive create below.
	if b.IsRunning() {
		return fmt.Errorf("boot is already running (marker exists)")
	}
	if err := b.EnsureDir(); err != nil {
		return fmt.Errorf("ensuring boot dir: %w", err)
	}

	// O_EXCL makes the create fail if the file already exists, closing the
	// window between the IsRunning check and the creation of the marker.
	f, err := os.OpenFile(b.markerPath(), os.O_WRONLY|os.O_CREATE|os.O_EXCL, 0666)
	if err != nil {
		if os.IsExist(err) {
			return fmt.Errorf("boot is already running (marker exists)")
		}
		return fmt.Errorf("creating marker: %w", err)
	}
	return f.Close()
}
// ReleaseLock removes the marker file.
//
// A marker that is already gone counts as released and yields nil: IsRunning
// deletes markers older than DefaultMarkerTTL, so after a long run the file
// may have been removed by another caller. Any other removal error is returned.
func (b *Boot) ReleaseLock() error {
	if err := os.Remove(b.markerPath()); err != nil && !os.IsNotExist(err) {
		return err
	}
	return nil
}
// SaveStatus persists status as indented JSON in the Boot directory's status
// file (mode 0644), creating the directory first when it is missing.
// It returns the first error from directory creation, encoding, or writing.
func (b *Boot) SaveStatus(status *Status) error {
	if dirErr := b.EnsureDir(); dirErr != nil {
		return dirErr
	}

	encoded, encErr := json.MarshalIndent(status, "", " ")
	if encErr != nil {
		return encErr
	}

	target := b.statusPath()
	return os.WriteFile(target, encoded, 0644)
}
// LoadStatus reads Boot's last execution status from the status file.
//
// A missing file is not an error: an empty Status is returned instead.
// Any other read error, or a JSON decoding error, is returned with a nil Status.
func (b *Boot) LoadStatus() (*Status, error) {
	raw, readErr := os.ReadFile(b.statusPath())
	switch {
	case readErr == nil:
		// fall through to decoding
	case os.IsNotExist(readErr):
		return &Status{}, nil
	default:
		return nil, readErr
	}

	loaded := new(Status)
	if decErr := json.Unmarshal(raw, loaded); decErr != nil {
		return nil, decErr
	}
	return loaded, nil
}
// Spawn starts a Boot run.
//
// It refuses to start when a fresh marker says Boot is already running.
// Otherwise it launches Boot as a subprocess when the manager is in degraded
// mode, or in a fresh tmux session when it is not.
func (b *Boot) Spawn() error {
	switch {
	case b.IsRunning():
		return fmt.Errorf("boot is already running")
	case b.degraded:
		return b.spawnDegraded()
	default:
		return b.spawnTmux()
	}
}
// spawnTmux spawns Boot in a tmux session named SessionName.
//
// Any existing session with that name is killed first, then a new session is
// created in the Deacon directory and the Claude start command is sent to it.
// If sending the command fails, the freshly created session is killed again
// so that no empty session is left behind for IsSessionAlive to report.
func (b *Boot) spawnTmux() error {
	// Kill any stale session first (best effort; NewSession reports real trouble).
	if b.IsSessionAlive() {
		_ = b.tmux.KillSession(SessionName)
	}

	// Create new session in deacon directory
	if err := b.tmux.NewSession(SessionName, b.deaconDir); err != nil {
		return fmt.Errorf("creating boot session: %w", err)
	}

	// Session-level environment is best effort: the start command below also
	// exports the same variables inline.
	_ = b.tmux.SetEnvironment(SessionName, "GT_ROLE", "boot")
	_ = b.tmux.SetEnvironment(SessionName, "BD_ACTOR", "deacon-boot")

	// Launch Claude with environment exported inline
	startCmd := "export GT_ROLE=boot BD_ACTOR=deacon-boot && claude --dangerously-skip-permissions"
	if err := b.tmux.SendKeys(SessionName, startCmd); err != nil {
		// Nothing was started in the session; remove it rather than leak it.
		_ = b.tmux.KillSession(SessionName)
		return fmt.Errorf("sending startup command: %w", err)
	}
	return nil
}
// spawnDegraded spawns Boot in degraded mode (no tmux).
//
// It starts `gt boot triage --degraded` in the Deacon directory with GT_ROLE,
// BD_ACTOR and GT_DEGRADED set, and returns as soon as the process has been
// started. The child is reaped in the background so a long-lived caller does
// not accumulate unreaped processes; its exit status is not reported.
func (b *Boot) spawnDegraded() error {
	// In degraded mode, we run gt boot triage directly
	// This performs the triage logic without a full Claude session
	cmd := exec.Command("gt", "boot", "triage", "--degraded")
	cmd.Dir = b.deaconDir
	cmd.Env = append(os.Environ(),
		"GT_ROLE=boot",
		"BD_ACTOR=deacon-boot",
		"GT_DEGRADED=true",
	)

	// Run async - don't block the caller on completion.
	if err := cmd.Start(); err != nil {
		return fmt.Errorf("starting degraded triage: %w", err)
	}

	// os/exec requires Wait after a successful Start to release the resources
	// associated with the command. The goroutine ends when the child exits.
	go func() { _ = cmd.Wait() }()
	return nil
}
// IsDegraded returns whether Boot is in degraded mode, as determined when the
// manager was constructed.
func (b *Boot) IsDegraded() bool {
return b.degraded
}
// Dir returns Boot's own directory, where the marker and status files live.
func (b *Boot) Dir() string {
return b.bootDir
}
// DeaconDir returns the Deacon's directory, which spawned Boot runs use as
// their working directory.
func (b *Boot) DeaconDir() string {
return b.deaconDir
}
// Tmux returns the tmux manager held by this Boot instance.
func (b *Boot) Tmux() *tmux.Tmux {
return b.tmux
}

321
internal/cmd/boot.go Normal file
View File

@@ -0,0 +1,321 @@
package cmd
import (
"encoding/json"
"fmt"
"os"
"time"
"github.com/spf13/cobra"
"github.com/steveyegge/gastown/internal/boot"
"github.com/steveyegge/gastown/internal/style"
"github.com/steveyegge/gastown/internal/workspace"
)
// Flag storage for the boot subcommands; bound to flags in init.
var (
// bootStatusJSON is set by --json on `gt boot status`.
bootStatusJSON bool
// bootDegraded is set by --degraded on `gt boot triage`.
// NOTE(review): it is not read by any code visible in this file.
bootDegraded bool
)
// bootCmd is the parent `gt boot` command. It has no run function of its own;
// the status, spawn and triage subcommands are attached to it in init.
var bootCmd = &cobra.Command{
Use: "boot",
GroupID: GroupAgents,
Short: "Manage Boot (Deacon watchdog)",
Long: `Manage Boot - the daemon's watchdog for Deacon triage.
Boot is a special dog that runs fresh on each daemon tick. It observes
the system state and decides whether to start/wake/nudge/interrupt the
Deacon, or do nothing. This centralizes the "when to wake" decision in
an agent that can reason about it.
Boot lifecycle:
1. Daemon tick spawns Boot (fresh each time)
2. Boot runs triage: observe, decide, act
3. Boot cleans inbox (discards stale handoffs)
4. Boot exits (or handoffs in non-degraded mode)
Location: ~/gt/deacon/dogs/boot/
Session: gt-deacon-boot`,
}
// bootStatusCmd is `gt boot status`; it is handled by runBootStatus and
// accepts --json for machine-readable output.
var bootStatusCmd = &cobra.Command{
Use: "status",
Short: "Show Boot status",
Long: `Show Boot's current status and last execution.
Displays:
- Whether Boot is currently running
- Last action taken (start/wake/nudge/nothing)
- Timing information
- Degraded mode status`,
RunE: runBootStatus,
}
// bootSpawnCmd is `gt boot spawn`; it is handled by runBootSpawn.
var bootSpawnCmd = &cobra.Command{
Use: "spawn",
Short: "Spawn Boot for triage",
Long: `Spawn Boot to run the triage cycle.
This is normally called by the daemon. It spawns Boot in a fresh
tmux session (or subprocess in degraded mode) to observe and decide
what action to take on the Deacon.
Boot runs to completion and exits - it doesn't maintain state
between invocations.`,
RunE: runBootSpawn,
}
// bootTriageCmd is `gt boot triage`; it is handled by runBootTriage, which
// runs the mechanical (non-AI) check in-process.
//
// The help text describes what the mechanical triage actually does: it only
// reports a missing Deacon session and never starts or restarts the Deacon.
var bootTriageCmd = &cobra.Command{
	Use:   "triage",
	Short: "Run triage directly (degraded mode)",
	Long: `Run Boot's triage logic directly without Claude.
This is for degraded mode operation when tmux is unavailable.
It performs basic observation and takes conservative action:
- If the Deacon session cannot be checked: record the error
- If the Deacon session is missing: report it (the daemon handles restart)
- Otherwise: do nothing
Use --degraded flag when running in degraded mode.`,
	RunE: runBootTriage,
}
// init binds the boot flags and wires the command tree: status, spawn and
// triage under `boot`, and `boot` under the root command.
func init() {
	statusFlags := bootStatusCmd.Flags()
	statusFlags.BoolVar(&bootStatusJSON, "json", false, "Output as JSON")

	triageFlags := bootTriageCmd.Flags()
	triageFlags.BoolVar(&bootDegraded, "degraded", false, "Run in degraded mode (no tmux)")

	bootCmd.AddCommand(bootStatusCmd, bootSpawnCmd, bootTriageCmd)
	rootCmd.AddCommand(bootCmd)
}
// getBootManager locates the town root starting from the current working
// directory and returns a Boot manager for it. A lookup failure is returned
// wrapped with "finding town root".
func getBootManager() (*boot.Boot, error) {
	root, findErr := workspace.FindFromCwd()
	if findErr != nil {
		return nil, fmt.Errorf("finding town root: %w", findErr)
	}
	mgr := boot.New(root)
	return mgr, nil
}
// runBootStatus implements `gt boot status`.
//
// It loads the persisted status, checks the marker and the tmux session, and
// prints the result either as JSON (with --json) or as a human-readable
// summary. It returns an error when the town root cannot be found, the status
// file cannot be loaded, or JSON encoding fails.
func runBootStatus(cmd *cobra.Command, args []string) error {
	b, err := getBootManager()
	if err != nil {
		return err
	}
	status, err := b.LoadStatus()
	if err != nil {
		return fmt.Errorf("loading status: %w", err)
	}
	isRunning := b.IsRunning()
	sessionAlive := b.IsSessionAlive()

	if bootStatusJSON {
		output := map[string]interface{}{
			"running":       isRunning,
			"session_alive": sessionAlive,
			"degraded":      b.IsDegraded(),
			"boot_dir":      b.Dir(),
			"last_status":   status,
		}
		enc := json.NewEncoder(os.Stdout)
		enc.SetIndent("", " ")
		return enc.Encode(output)
	}

	// Pretty print
	fmt.Println(style.Bold.Render("Boot Status"))
	fmt.Println()
	if isRunning {
		fmt.Printf(" State: %s\n", style.Bold.Render("running"))
	} else {
		fmt.Printf(" State: %s\n", style.Dim.Render("idle"))
	}
	if sessionAlive {
		fmt.Printf(" Session: %s (alive)\n", boot.SessionName)
	} else {
		fmt.Printf(" Session: %s\n", style.Dim.Render("not running"))
	}
	if b.IsDegraded() {
		fmt.Printf(" Mode: %s\n", style.Bold.Render("DEGRADED"))
	} else {
		fmt.Printf(" Mode: normal\n")
	}

	fmt.Println()
	fmt.Println(style.Dim.Render("Last Execution:"))
	if status.StartedAt.IsZero() {
		fmt.Printf(" %s\n", style.Dim.Render("(no executions recorded)"))
	} else {
		if !status.CompletedAt.IsZero() {
			duration := status.CompletedAt.Sub(status.StartedAt)
			// formatDurationAgo returns "just now" for anything under a
			// minute; only the other forms ("5 min", "2 hours", ...) take an
			// " ago" suffix, otherwise the output reads "just now ago".
			ago := formatDurationAgo(time.Since(status.CompletedAt))
			if ago != "just now" {
				ago += " ago"
			}
			fmt.Printf(" Completed: %s (%s)\n",
				status.CompletedAt.Format("15:04:05"), ago)
			fmt.Printf(" Duration: %s\n", duration.Round(time.Millisecond))
		} else {
			// Run recorded as started but never completed.
			fmt.Printf(" Started: %s\n", status.StartedAt.Format("15:04:05"))
		}
		if status.LastAction != "" {
			fmt.Printf(" Action: %s", status.LastAction)
			if status.Target != "" {
				fmt.Printf(" → %s", status.Target)
			}
			fmt.Println()
		}
		if status.Error != "" {
			fmt.Printf(" Error: %s\n", style.Bold.Render(status.Error))
		}
	}

	fmt.Println()
	fmt.Printf(" Dir: %s\n", b.Dir())
	return nil
}
// runBootSpawn implements `gt boot spawn`.
//
// When a fresh marker shows Boot is already running it prints a notice and
// succeeds without spawning. Otherwise it records a "running" status, spawns
// Boot, and on spawn failure records the error and completion time (best
// effort) before returning the wrapped error.
func runBootSpawn(cmd *cobra.Command, args []string) error {
	mgr, mgrErr := getBootManager()
	if mgrErr != nil {
		return mgrErr
	}

	if mgr.IsRunning() {
		fmt.Println("Boot is already running - skipping spawn")
		return nil
	}

	// Record the start before spawning.
	run := &boot.Status{Running: true, StartedAt: time.Now()}
	if saveErr := mgr.SaveStatus(run); saveErr != nil {
		return fmt.Errorf("saving status: %w", saveErr)
	}

	if spawnErr := mgr.Spawn(); spawnErr != nil {
		run.Error = spawnErr.Error()
		run.CompletedAt = time.Now()
		run.Running = false
		// The spawn error is what matters; a failed status write is ignored.
		_ = mgr.SaveStatus(run)
		return fmt.Errorf("spawning boot: %w", spawnErr)
	}

	switch {
	case mgr.IsDegraded():
		fmt.Println("Boot spawned in degraded mode (subprocess)")
	default:
		fmt.Printf("Boot spawned in session: %s\n", boot.SessionName)
	}
	return nil
}
// runBootTriage implements `gt boot triage`.
//
// It takes the Boot lock for the duration of the run, performs the mechanical
// triage, persists the outcome (action, target, timing, error text) and then
// prints a one-line summary. A status-save failure takes precedence over a
// triage error; otherwise the triage error, if any, is returned.
func runBootTriage(cmd *cobra.Command, args []string) error {
	mgr, mgrErr := getBootManager()
	if mgrErr != nil {
		return mgrErr
	}

	if lockErr := mgr.AcquireLock(); lockErr != nil {
		return fmt.Errorf("acquiring lock: %w", lockErr)
	}
	// Release is best effort; there is nothing useful to do if it fails.
	defer func() { _ = mgr.ReleaseLock() }()

	began := time.Now()

	// In degraded mode, we do basic mechanical triage
	// without full Claude reasoning capability
	action, target, triageErr := runDegradedTriage(mgr)

	outcome := &boot.Status{
		Running:     false,
		StartedAt:   began,
		CompletedAt: time.Now(),
		LastAction:  action,
		Target:      target,
	}
	if triageErr != nil {
		outcome.Error = triageErr.Error()
	}
	if saveErr := mgr.SaveStatus(outcome); saveErr != nil {
		return fmt.Errorf("saving status: %w", saveErr)
	}
	if triageErr != nil {
		return triageErr
	}

	summary := "Triage complete: " + action
	if target != "" {
		summary += " → " + target
	}
	fmt.Println(summary)
	return nil
}
// runDegradedTriage performs a basic Deacon health check without AI reasoning.
// It is the mechanical fallback used when full Claude sessions aren't available.
//
// It only asks tmux whether the "gt-deacon" session exists and returns:
//   - ("error", "deacon", err) when the session lookup itself fails
//   - ("report", "deacon-missing", nil) when the session does not exist;
//     the restart is left to the daemon
//   - ("nothing", "", nil) when the session exists; no deeper responsiveness
//     check is made
func runDegradedTriage(b *boot.Boot) (action, target string, err error) {
	const deaconSession = "gt-deacon"

	alive, lookupErr := b.Tmux().HasSession(deaconSession)
	switch {
	case lookupErr != nil:
		return "error", "deacon", fmt.Errorf("checking deacon session: %w", lookupErr)
	case !alive:
		return "report", "deacon-missing", nil
	default:
		return "nothing", "", nil
	}
}
// formatDurationAgo renders d as a coarse, human-readable age.
//
// Anything under a minute (including negative durations) is "just now".
// Otherwise the largest whole unit is used, truncated: "N min" below an hour,
// "1 hour"/"N hours" below a day, and "1 day"/"N days" beyond that. No "ago"
// suffix is included in the result.
func formatDurationAgo(d time.Duration) string {
	if d < time.Minute {
		return "just now"
	}

	// render picks the singular form for exactly one unit.
	render := func(count int, singular, plural string) string {
		if count == 1 {
			return "1 " + singular
		}
		return fmt.Sprintf("%d %s", count, plural)
	}

	if d < time.Hour {
		return render(int(d.Minutes()), "min", "min")
	}
	if d < 24*time.Hour {
		return render(int(d.Hours()), "hour", "hours")
	}
	return render(int(d.Hours()/24), "day", "days")
}

View File

@@ -14,6 +14,7 @@ import (
"time"
"github.com/steveyegge/gastown/internal/beads"
"github.com/steveyegge/gastown/internal/boot"
"github.com/steveyegge/gastown/internal/constants"
"github.com/steveyegge/gastown/internal/feed"
"github.com/steveyegge/gastown/internal/keepalive"
@@ -189,8 +190,9 @@ func (d *Daemon) calculateHeartbeatInterval() time.Duration {
func (d *Daemon) heartbeat(state *State) {
d.logger.Println("Heartbeat starting (recovery-focused)")
// 1. Ensure Deacon is running (restart if dead)
d.ensureDeaconRunning()
// 1. Poke Boot (the Deacon's watchdog) instead of Deacon directly
// Boot handles the "when to wake Deacon" decision via triage logic
d.ensureBootRunning()
// 2. Ensure Witnesses are running for all rigs (restart if dead)
d.ensureWitnessesRunning()
@@ -233,6 +235,71 @@ const DeaconSessionName = "gt-deacon"
// DeaconRole is the role name ("deacon") used for the Deacon's handoff bead.
const DeaconRole = "deacon"
// ensureBootRunning pokes Boot, the watchdog that triages the Deacon.
//
// Nothing is done when a fresh marker shows Boot is already running. When
// GT_DEGRADED is "true" or tmux cannot be invoked, the mechanical triage runs
// in-process instead. Otherwise Boot is spawned; if that fails, the daemon
// falls back to ensuring the Deacon is running directly.
func (d *Daemon) ensureBootRunning() {
	watchdog := boot.New(d.config.TownRoot)

	// A recent marker means a Boot run is still in flight.
	if watchdog.IsRunning() {
		d.logger.Println("Boot already running, skipping spawn")
		return
	}

	// Degraded mode: explicit opt-in via environment, or tmux unavailable.
	if os.Getenv("GT_DEGRADED") == "true" || !d.tmux.IsAvailable() {
		d.logger.Println("Degraded mode: running mechanical Boot triage")
		d.runDegradedBootTriage(watchdog)
		return
	}

	d.logger.Println("Spawning Boot for triage...")
	spawnErr := watchdog.Spawn()
	if spawnErr == nil {
		d.logger.Println("Boot spawned successfully")
		return
	}

	// Fallback: ensure Deacon is running directly
	d.logger.Printf("Error spawning Boot: %v, falling back to direct Deacon check", spawnErr)
	d.ensureDeaconRunning()
}
// runDegradedBootTriage performs the mechanical Boot logic without AI reasoning.
//
// It asks tmux whether the Deacon session exists, starts the Deacon when it
// does not, and records the outcome ("error", "start" or "nothing") in Boot's
// status file. A failure to save the status is only logged.
func (d *Daemon) runDegradedBootTriage(b *boot.Boot) {
	result := &boot.Status{
		Running:   true,
		StartedAt: time.Now(),
	}

	// Simple check: is Deacon session alive?
	alive, probeErr := d.tmux.HasSession(DeaconSessionName)
	switch {
	case probeErr != nil:
		d.logger.Printf("Error checking Deacon session: %v", probeErr)
		result.LastAction = "error"
		result.Error = probeErr.Error()
	case !alive:
		d.logger.Println("Deacon not running, starting...")
		d.ensureDeaconRunning()
		result.LastAction = "start"
		result.Target = "deacon"
	default:
		result.LastAction = "nothing"
	}

	result.Running = false
	result.CompletedAt = time.Now()
	if saveErr := b.SaveStatus(result); saveErr != nil {
		d.logger.Printf("Warning: failed to save Boot status: %v", saveErr)
	}
}
// ensureDeaconRunning ensures the Deacon is running.
// ZFC-compliant: trusts agent bead state, no tmux inference.

View File

@@ -0,0 +1,136 @@
# Boot Context
> **Recovery**: Run `gt prime` after compaction, clear, or new session
## Your Role: BOOT (Deacon Watchdog)
You are **Boot** - the daemon's watchdog for Deacon triage. You are spawned fresh
on each daemon tick to observe the system and decide what action to take.
## Theory of Operation
The daemon is dumb transport (ZFC principle). It can't decide:
- Is the Deacon stuck or just thinking?
- Should we interrupt or let it continue?
- Is the system in a state where nudging would help?
You are an agent that CAN observe and decide. The daemon pokes you instead of
the Deacon directly, centralizing the "when to wake" decision in reasoning.
## Your Lifecycle
```
Daemon tick
├── Check: Is Boot already running? (marker file)
│ └── Yes + recent: Skip this tick
└── Spawn Boot (fresh session each time)
└── Boot runs triage
├── Observe (wisps, mail, git state, tmux panes)
├── Decide (start/wake/nudge/interrupt/nothing)
├── Act
├── Clean inbox (discard stale handoffs)
└── Exit (or handoff in non-degraded mode)
```
## You Are Always Fresh
Boot restarts on each daemon tick. This is intentional:
- Narrow scope makes restarts cheap
- Fresh context avoids accumulated confusion
- Handoff mail provides continuity without session persistence
- No keepalive needed
## Working Directory
**IMPORTANT**: Always work from `{{ .TownRoot }}/deacon/` directory.
You share context with the Deacon - both operate on the same state.
## Triage Steps
### Step 1: Observe
Check the current system state:
```bash
# Is Deacon session alive?
tmux has-session -t gt-deacon 2>/dev/null && echo "alive" || echo "dead"
# If alive, what's the pane showing?
gt peek deacon --lines 20
# Agent bead state
bd show gt-deacon 2>/dev/null
# Recent activity
gt feed --since 10m --plain | head -20
```
### Step 2: Decide
Analyze observations using this decision matrix:
| Deacon State | Pending Mail | Action |
|--------------|---------------|--------|
| Dead session | N/A | START (daemon will restart) |
| Alive, active output | N/A | NOTHING |
| Alive, idle < 5 min | N/A | NOTHING |
| Alive, idle 5-15 min | No mail | NOTHING |
| Alive, idle 5-15 min | Has mail | NUDGE |
| Alive, idle > 15 min | Any | WAKE |
| Alive, stuck (errors) | Any | INTERRUPT |
**Judgment Guidance**: Agents may take several minutes on legitimate work.
Don't be too aggressive - false positives are disruptive.
### Step 3: Act
Execute the decided action:
- **NOTHING**: Log and exit
- **NUDGE**: `gt nudge deacon "Boot check-in: you have pending work"`
- **WAKE**: Send Escape to the Deacon's pane, then `gt nudge deacon "Boot wake: check your inbox"`
- **INTERRUPT**: Mail the Deacon requesting restart consideration
- **START**: Log detection (daemon handles restart)
### Step 4: Clean
Archive stale handoff messages (> 1 hour old) from Deacon's inbox.
### Step 5: Exit
In degraded mode: Exit directly.
In normal mode: Optional brief handoff mail for next Boot instance.
## Degraded Mode (GT_DEGRADED=true)
When tmux is unavailable:
- Cannot observe tmux panes
- Cannot interactively interrupt
- Focus on beads/git state observation only
- Report anomalies; you cannot fix them interactively
- Run to completion and exit (no handoff)
## Commands
```bash
# Your status
gt boot status
# Manual spawn (for debugging)
gt boot spawn
# Run triage directly (degraded mode)
gt boot triage --degraded
```
## Important Notes
- You are ephemeral - no persistent state between invocations
- Each tick is a fresh observation
- Be conservative - false positives disrupt legitimate work
- When in doubt, choose NOTHING over NUDGE
- Trust the Deacon unless there's clear evidence of stuck state

View File

@@ -91,6 +91,12 @@ func (t *Tmux) KillServer() error {
return err
}
// IsAvailable reports whether tmux can be invoked: it runs `tmux -V` and
// returns true only when that command completes without error.
func (t *Tmux) IsAvailable() bool {
	probeErr := exec.Command("tmux", "-V").Run()
	return probeErr == nil
}
// HasSession checks if a session exists.
func (t *Tmux) HasSession(name string) (bool, error) {
_, err := t.run("has-session", "-t", name)