diff --git a/.beads/formulas/mol-boot-triage.formula.toml b/.beads/formulas/mol-boot-triage.formula.toml new file mode 100644 index 00000000..9380580d --- /dev/null +++ b/.beads/formulas/mol-boot-triage.formula.toml @@ -0,0 +1,228 @@ +description = """ +Boot triage cycle - the daemon's watchdog for Deacon health. + +Boot is spawned fresh on each daemon tick to decide whether to start/wake/nudge/interrupt +the Deacon, or do nothing. This centralizes the "when to wake" decision in an agent that +can reason about context rather than relying on mechanical thresholds. + +Boot lifecycle: +1. Observe (wisps, mail, git state, tmux panes) +2. Decide (start/wake/nudge/interrupt/nothing) +3. Act +4. Clean inbox (discard stale handoffs) +5. Exit (or handoff in non-degraded mode) + +Boot is always fresh - no persistent state between invocations. +Handoff mail provides continuity for the next Boot instance. +""" +formula = "mol-boot-triage" +version = 1 + +[[steps]] +id = "observe" +title = "Observe system state" +description = """ +Observe the current system state to inform triage decisions. + +**Step 1: Check Deacon state** +```bash +# Is Deacon session alive? +tmux has-session -t gt-deacon 2>/dev/null && echo "alive" || echo "dead" + +# If alive, what's the pane output showing? +gt peek deacon --lines 20 +``` + +**Step 2: Check agent bead state** +```bash +bd show gt-deacon 2>/dev/null +# Look for: +# - state: running/working/idle +# - last_activity: when was last update? +``` + +**Step 3: Check recent activity** +```bash +# Recent feed events +gt feed --since 10m --plain | head -20 + +# Recent wisps (operational state) +ls -lt ~/gt/.beads-wisp/*.wisp.json 2>/dev/null | head -5 +``` + +**Step 4: Check Deacon mail** +```bash +# Does Deacon have unread mail? +gt mail inbox deacon 2>/dev/null | head -10 +``` + +Record observations for the decide step: +- deacon_alive: true/false +- pane_activity: active/idle/stuck +- last_activity_age: duration since last activity +- pending_mail: count of unread messages +- error_signals: any errors observed +""" + +[[steps]] +id = "decide" +title = "Decide on action" +needs = ["observe"] +description = """ +Analyze observations and decide what action to take. + +**Decision Matrix** + +| Deacon State | Pane Activity | Action | +|--------------|---------------|--------| +| Dead session | N/A | START | +| Alive, active output | N/A | NOTHING | +| Alive, idle < 5 min | N/A | NOTHING | +| Alive, idle 5-15 min | No mail | NOTHING | +| Alive, idle 5-15 min | Has mail | NUDGE | +| Alive, idle > 15 min | Any | WAKE | +| Alive, stuck (errors) | Any | INTERRUPT | + +**Judgment Guidance** + +Agents may take several minutes on legitimate work. Ten minutes or more in edge cases. +Don't be too aggressive - false positives are disruptive. + +Signs of stuck: +- Same error repeated in pane +- Tool prompt waiting indefinitely +- Silence with pending mail +- Agent reporting issues but not progressing + +Signs of working: +- Tool calls in progress +- File reads/writes happening +- Recent commits or beads updates + +**Output**: Record decision as one of: +- NOTHING: Let Deacon continue +- NUDGE: Gentle wake signal (gt nudge) +- WAKE: Stronger wake (escape + message) +- INTERRUPT: Force restart needed +- START: Session is dead, start fresh +""" + +[[steps]] +id = "act" +title = "Execute decided action" +needs = ["decide"] +description = """ +Execute the action decided in the previous step. + +**NOTHING** +No action needed. Log observation and exit. + +**NUDGE** +```bash +gt nudge deacon "Boot check-in: you have pending work" +``` + +**WAKE** +```bash +# Send escape to break any tool waiting +tmux send-keys -t gt-deacon Escape + +# Brief pause +sleep 1 + +# Send wake message +gt nudge deacon "Boot wake: please check your inbox and pending work" +``` + +**INTERRUPT** +```bash +# This is more aggressive - signals Deacon to restart +gt mail send deacon -s "INTERRUPT: Boot detected stuck state" \ + -m "Boot observed stuck state. Please check your context and consider handoff. + +Observations: +- + +If you're making progress, please update your agent bead to reflect activity." +``` + +**START** +```bash +# Deacon is dead - daemon will restart it +# Just log that we detected this +echo "Boot detected dead Deacon session - daemon will restart" +``` + +Record action taken for status update. +""" + +[[steps]] +id = "cleanup" +title = "Clean stale handoffs" +needs = ["act"] +description = """ +Clean up stale handoff messages from Deacon's inbox. + +Handoff messages older than 1 hour are likely stale - the intended recipient +either processed them or crashed before seeing them. + +**Step 1: List Deacon inbox** +```bash +gt mail inbox deacon --json 2>/dev/null +``` + +**Step 2: Archive stale handoffs** +For each message: +- Check if subject contains "HANDOFF" or "handoff" +- Check if age > 1 hour +- If both: archive it + +```bash +# For each stale handoff: +gt mail archive +``` + +**Step 3: Archive Boot's own old mail** +Boot doesn't need persistent inbox. Archive anything processed: +```bash +gt mail inbox boot --json 2>/dev/null +# Archive any messages older than current session +``` + +Keep the system clean - old handoffs just add noise. +""" + +[[steps]] +id = "exit" +title = "Exit or handoff" +needs = ["cleanup"] +description = """ +Complete this Boot cycle. + +**In degraded mode (GT_DEGRADED=true)** +Exit directly - no handoff needed: +```bash +# Log completion +echo "Boot triage complete: " +exit 0 +``` + +**In normal mode** +Write brief handoff for next Boot instance: +```bash +gt mail send boot -s "Boot handoff" -m "Completed triage cycle. +Action: +Observations: +Time: $(date)" +``` + +Then exit. The next daemon tick will spawn a fresh Boot. + +**Update status file** +```bash +# The gt boot command handles this automatically +# Status is written to ~/gt/deacon/dogs/boot/.boot-status.json +``` + +Boot is ephemeral by design. Each instance runs fresh. +""" diff --git a/internal/boot/boot.go b/internal/boot/boot.go new file mode 100644 index 00000000..a8a7c715 --- /dev/null +++ b/internal/boot/boot.go @@ -0,0 +1,231 @@ +// Package boot manages the Boot watchdog - the daemon's entry point for Deacon triage. +// Boot is a dog that runs fresh on each daemon tick, deciding whether to wake/nudge/interrupt +// the Deacon or let it continue. This centralizes the "when to wake" decision in an agent. +package boot + +import ( + "encoding/json" + "fmt" + "os" + "os/exec" + "path/filepath" + "time" + + "github.com/steveyegge/gastown/internal/tmux" +) + +// SessionName is the tmux session name for Boot. +const SessionName = "gt-deacon-boot" + +// MarkerFileName is the file that indicates Boot is currently running. +const MarkerFileName = ".boot-running" + +// StatusFileName stores Boot's last execution status. +const StatusFileName = ".boot-status.json" + +// DefaultMarkerTTL is how long a marker is considered valid before it's stale. +const DefaultMarkerTTL = 5 * time.Minute + +// Status represents Boot's execution status. +type Status struct { + Running bool `json:"running"` + StartedAt time.Time `json:"started_at,omitempty"` + CompletedAt time.Time `json:"completed_at,omitempty"` + LastAction string `json:"last_action,omitempty"` // start/wake/nudge/nothing + Target string `json:"target,omitempty"` // deacon, witness, etc. + Error string `json:"error,omitempty"` +} + +// Boot manages the Boot watchdog lifecycle. +type Boot struct { + townRoot string + bootDir string // ~/gt/deacon/dogs/boot/ + deaconDir string // ~/gt/deacon/ + tmux *tmux.Tmux + degraded bool +} + +// New creates a new Boot manager. +func New(townRoot string) *Boot { + return &Boot{ + townRoot: townRoot, + bootDir: filepath.Join(townRoot, "deacon", "dogs", "boot"), + deaconDir: filepath.Join(townRoot, "deacon"), + tmux: tmux.NewTmux(), + degraded: os.Getenv("GT_DEGRADED") == "true", + } +} + +// EnsureDir ensures the Boot directory exists. +func (b *Boot) EnsureDir() error { + return os.MkdirAll(b.bootDir, 0755) +} + +// markerPath returns the path to the marker file. +func (b *Boot) markerPath() string { + return filepath.Join(b.bootDir, MarkerFileName) +} + +// statusPath returns the path to the status file. +func (b *Boot) statusPath() string { + return filepath.Join(b.bootDir, StatusFileName) +} + +// IsRunning checks if Boot is currently running. +// Returns true if marker exists and isn't stale, false otherwise. +func (b *Boot) IsRunning() bool { + info, err := os.Stat(b.markerPath()) + if err != nil { + return false + } + + // Check if marker is stale (older than TTL) + age := time.Since(info.ModTime()) + if age > DefaultMarkerTTL { + // Stale marker - clean it up + _ = os.Remove(b.markerPath()) + return false + } + + return true +} + +// IsSessionAlive checks if the Boot tmux session exists. +func (b *Boot) IsSessionAlive() bool { + has, err := b.tmux.HasSession(SessionName) + return err == nil && has +} + +// AcquireLock creates the marker file to indicate Boot is starting. +// Returns error if Boot is already running. +func (b *Boot) AcquireLock() error { + if b.IsRunning() { + return fmt.Errorf("boot is already running (marker exists)") + } + + if err := b.EnsureDir(); err != nil { + return fmt.Errorf("ensuring boot dir: %w", err) + } + + // Create marker file + f, err := os.Create(b.markerPath()) + if err != nil { + return fmt.Errorf("creating marker: %w", err) + } + return f.Close() +} + +// ReleaseLock removes the marker file. +func (b *Boot) ReleaseLock() error { + return os.Remove(b.markerPath()) +} + +// SaveStatus saves Boot's execution status. +func (b *Boot) SaveStatus(status *Status) error { + if err := b.EnsureDir(); err != nil { + return err + } + + data, err := json.MarshalIndent(status, "", " ") + if err != nil { + return err + } + + return os.WriteFile(b.statusPath(), data, 0644) +} + +// LoadStatus loads Boot's last execution status. +func (b *Boot) LoadStatus() (*Status, error) { + data, err := os.ReadFile(b.statusPath()) + if err != nil { + if os.IsNotExist(err) { + return &Status{}, nil + } + return nil, err + } + + var status Status + if err := json.Unmarshal(data, &status); err != nil { + return nil, err + } + + return &status, nil +} + +// Spawn starts Boot in a fresh tmux session. +// Boot runs the mol-boot-triage molecule and exits when done. +// In degraded mode (no tmux), it runs in a subprocess. +func (b *Boot) Spawn() error { + if b.IsRunning() { + return fmt.Errorf("boot is already running") + } + + // Check for degraded mode + if b.degraded { + return b.spawnDegraded() + } + + return b.spawnTmux() +} + +// spawnTmux spawns Boot in a tmux session. +func (b *Boot) spawnTmux() error { + // Kill any stale session first + if b.IsSessionAlive() { + _ = b.tmux.KillSession(SessionName) + } + + // Create new session in deacon directory + if err := b.tmux.NewSession(SessionName, b.deaconDir); err != nil { + return fmt.Errorf("creating boot session: %w", err) + } + + // Set environment + _ = b.tmux.SetEnvironment(SessionName, "GT_ROLE", "boot") + _ = b.tmux.SetEnvironment(SessionName, "BD_ACTOR", "deacon-boot") + + // Launch Claude with environment exported inline + startCmd := "export GT_ROLE=boot BD_ACTOR=deacon-boot && claude --dangerously-skip-permissions" + if err := b.tmux.SendKeys(SessionName, startCmd); err != nil { + return fmt.Errorf("sending startup command: %w", err) + } + + return nil +} + +// spawnDegraded spawns Boot in degraded mode (no tmux). +// Boot runs to completion and exits without handoff. +func (b *Boot) spawnDegraded() error { + // In degraded mode, we run gt boot triage directly + // This performs the triage logic without a full Claude session + cmd := exec.Command("gt", "boot", "triage", "--degraded") + cmd.Dir = b.deaconDir + cmd.Env = append(os.Environ(), + "GT_ROLE=boot", + "BD_ACTOR=deacon-boot", + "GT_DEGRADED=true", + ) + + // Run async - don't wait for completion + return cmd.Start() +} + +// IsDegraded returns whether Boot is in degraded mode. +func (b *Boot) IsDegraded() bool { + return b.degraded +} + +// Dir returns Boot's working directory. +func (b *Boot) Dir() string { + return b.bootDir +} + +// DeaconDir returns the Deacon's directory. +func (b *Boot) DeaconDir() string { + return b.deaconDir +} + +// Tmux returns the tmux manager. +func (b *Boot) Tmux() *tmux.Tmux { + return b.tmux +} diff --git a/internal/cmd/boot.go b/internal/cmd/boot.go new file mode 100644 index 00000000..6becfd17 --- /dev/null +++ b/internal/cmd/boot.go @@ -0,0 +1,321 @@ +package cmd + +import ( + "encoding/json" + "fmt" + "os" + "time" + + "github.com/spf13/cobra" + "github.com/steveyegge/gastown/internal/boot" + "github.com/steveyegge/gastown/internal/style" + "github.com/steveyegge/gastown/internal/workspace" +) + +var ( + bootStatusJSON bool + bootDegraded bool +) + +var bootCmd = &cobra.Command{ + Use: "boot", + GroupID: GroupAgents, + Short: "Manage Boot (Deacon watchdog)", + Long: `Manage Boot - the daemon's watchdog for Deacon triage. + +Boot is a special dog that runs fresh on each daemon tick. It observes +the system state and decides whether to start/wake/nudge/interrupt the +Deacon, or do nothing. This centralizes the "when to wake" decision in +an agent that can reason about it. + +Boot lifecycle: + 1. Daemon tick spawns Boot (fresh each time) + 2. Boot runs triage: observe, decide, act + 3. Boot cleans inbox (discards stale handoffs) + 4. Boot exits (or handoffs in non-degraded mode) + +Location: ~/gt/deacon/dogs/boot/ +Session: gt-deacon-boot`, +} + +var bootStatusCmd = &cobra.Command{ + Use: "status", + Short: "Show Boot status", + Long: `Show Boot's current status and last execution. + +Displays: + - Whether Boot is currently running + - Last action taken (start/wake/nudge/nothing) + - Timing information + - Degraded mode status`, + RunE: runBootStatus, +} + +var bootSpawnCmd = &cobra.Command{ + Use: "spawn", + Short: "Spawn Boot for triage", + Long: `Spawn Boot to run the triage cycle. + +This is normally called by the daemon. It spawns Boot in a fresh +tmux session (or subprocess in degraded mode) to observe and decide +what action to take on the Deacon. + +Boot runs to completion and exits - it doesn't maintain state +between invocations.`, + RunE: runBootSpawn, +} + +var bootTriageCmd = &cobra.Command{ + Use: "triage", + Short: "Run triage directly (degraded mode)", + Long: `Run Boot's triage logic directly without Claude. + +This is for degraded mode operation when tmux is unavailable. +It performs basic observation and takes conservative action: + - If Deacon is not running: start it + - If Deacon appears stuck: attempt restart + - Otherwise: do nothing + +Use --degraded flag when running in degraded mode.`, + RunE: runBootTriage, +} + +func init() { + bootStatusCmd.Flags().BoolVar(&bootStatusJSON, "json", false, "Output as JSON") + bootTriageCmd.Flags().BoolVar(&bootDegraded, "degraded", false, "Run in degraded mode (no tmux)") + + bootCmd.AddCommand(bootStatusCmd) + bootCmd.AddCommand(bootSpawnCmd) + bootCmd.AddCommand(bootTriageCmd) + + rootCmd.AddCommand(bootCmd) +} + +func getBootManager() (*boot.Boot, error) { + townRoot, err := workspace.FindFromCwd() + if err != nil { + return nil, fmt.Errorf("finding town root: %w", err) + } + + return boot.New(townRoot), nil +} + +func runBootStatus(cmd *cobra.Command, args []string) error { + b, err := getBootManager() + if err != nil { + return err + } + + status, err := b.LoadStatus() + if err != nil { + return fmt.Errorf("loading status: %w", err) + } + + isRunning := b.IsRunning() + sessionAlive := b.IsSessionAlive() + + if bootStatusJSON { + output := map[string]interface{}{ + "running": isRunning, + "session_alive": sessionAlive, + "degraded": b.IsDegraded(), + "boot_dir": b.Dir(), + "last_status": status, + } + enc := json.NewEncoder(os.Stdout) + enc.SetIndent("", " ") + return enc.Encode(output) + } + + // Pretty print + fmt.Println(style.Bold.Render("Boot Status")) + fmt.Println() + + if isRunning { + fmt.Printf(" State: %s\n", style.Bold.Render("running")) + } else { + fmt.Printf(" State: %s\n", style.Dim.Render("idle")) + } + + if sessionAlive { + fmt.Printf(" Session: %s (alive)\n", boot.SessionName) + } else { + fmt.Printf(" Session: %s\n", style.Dim.Render("not running")) + } + + if b.IsDegraded() { + fmt.Printf(" Mode: %s\n", style.Bold.Render("DEGRADED")) + } else { + fmt.Printf(" Mode: normal\n") + } + + fmt.Println() + fmt.Println(style.Dim.Render("Last Execution:")) + + if status.StartedAt.IsZero() { + fmt.Printf(" %s\n", style.Dim.Render("(no executions recorded)")) + } else { + if !status.CompletedAt.IsZero() { + duration := status.CompletedAt.Sub(status.StartedAt) + fmt.Printf(" Completed: %s (%s ago)\n", + status.CompletedAt.Format("15:04:05"), + formatDurationAgo(time.Since(status.CompletedAt))) + fmt.Printf(" Duration: %s\n", duration.Round(time.Millisecond)) + } else { + fmt.Printf(" Started: %s\n", status.StartedAt.Format("15:04:05")) + } + + if status.LastAction != "" { + fmt.Printf(" Action: %s", status.LastAction) + if status.Target != "" { + fmt.Printf(" → %s", status.Target) + } + fmt.Println() + } + + if status.Error != "" { + fmt.Printf(" Error: %s\n", style.Bold.Render(status.Error)) + } + } + + fmt.Println() + fmt.Printf(" Dir: %s\n", b.Dir()) + + return nil +} + +func runBootSpawn(cmd *cobra.Command, args []string) error { + b, err := getBootManager() + if err != nil { + return err + } + + if b.IsRunning() { + fmt.Println("Boot is already running - skipping spawn") + return nil + } + + // Save starting status + status := &boot.Status{ + Running: true, + StartedAt: time.Now(), + } + if err := b.SaveStatus(status); err != nil { + return fmt.Errorf("saving status: %w", err) + } + + // Spawn Boot + if err := b.Spawn(); err != nil { + status.Error = err.Error() + status.CompletedAt = time.Now() + status.Running = false + _ = b.SaveStatus(status) + return fmt.Errorf("spawning boot: %w", err) + } + + if b.IsDegraded() { + fmt.Println("Boot spawned in degraded mode (subprocess)") + } else { + fmt.Printf("Boot spawned in session: %s\n", boot.SessionName) + } + + return nil +} + +func runBootTriage(cmd *cobra.Command, args []string) error { + b, err := getBootManager() + if err != nil { + return err + } + + // Acquire lock + if err := b.AcquireLock(); err != nil { + return fmt.Errorf("acquiring lock: %w", err) + } + defer func() { _ = b.ReleaseLock() }() + + startTime := time.Now() + status := &boot.Status{ + Running: true, + StartedAt: startTime, + } + + // In degraded mode, we do basic mechanical triage + // without full Claude reasoning capability + action, target, triageErr := runDegradedTriage(b) + + status.LastAction = action + status.Target = target + status.Running = false + status.CompletedAt = time.Now() + + if triageErr != nil { + status.Error = triageErr.Error() + } + + if err := b.SaveStatus(status); err != nil { + return fmt.Errorf("saving status: %w", err) + } + + if triageErr != nil { + return triageErr + } + + fmt.Printf("Triage complete: %s", action) + if target != "" { + fmt.Printf(" → %s", target) + } + fmt.Println() + + return nil +} + +// runDegradedTriage performs basic Deacon health check without AI reasoning. +// This is a mechanical fallback when full Claude sessions aren't available. +func runDegradedTriage(b *boot.Boot) (action, target string, err error) { + tm := b.Tmux() + + // Check if Deacon session exists + deaconSession := "gt-deacon" + hasDeacon, err := tm.HasSession(deaconSession) + if err != nil { + return "error", "deacon", fmt.Errorf("checking deacon session: %w", err) + } + + if !hasDeacon { + // Deacon not running - this is unusual, daemon should have restarted it + // In degraded mode, we just report - let daemon handle restart + return "report", "deacon-missing", nil + } + + // Deacon exists - check if it's responsive (basic pane output check) + // In degraded mode, we can't do sophisticated analysis + // Just verify the session is alive + return "nothing", "", nil +} + +// formatDurationAgo formats a duration for human display. +func formatDurationAgo(d time.Duration) string { + switch { + case d < time.Minute: + return "just now" + case d < time.Hour: + mins := int(d.Minutes()) + if mins == 1 { + return "1 min" + } + return fmt.Sprintf("%d min", mins) + case d < 24*time.Hour: + hours := int(d.Hours()) + if hours == 1 { + return "1 hour" + } + return fmt.Sprintf("%d hours", hours) + default: + days := int(d.Hours() / 24) + if days == 1 { + return "1 day" + } + return fmt.Sprintf("%d days", days) + } +} diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go index f321dbb2..497d7a6c 100644 --- a/internal/daemon/daemon.go +++ b/internal/daemon/daemon.go @@ -14,6 +14,7 @@ import ( "time" "github.com/steveyegge/gastown/internal/beads" + "github.com/steveyegge/gastown/internal/boot" "github.com/steveyegge/gastown/internal/constants" "github.com/steveyegge/gastown/internal/feed" "github.com/steveyegge/gastown/internal/keepalive" @@ -189,8 +190,9 @@ func (d *Daemon) calculateHeartbeatInterval() time.Duration { func (d *Daemon) heartbeat(state *State) { d.logger.Println("Heartbeat starting (recovery-focused)") - // 1. Ensure Deacon is running (restart if dead) - d.ensureDeaconRunning() + // 1. Poke Boot (the Deacon's watchdog) instead of Deacon directly + // Boot handles the "when to wake Deacon" decision via triage logic + d.ensureBootRunning() // 2. Ensure Witnesses are running for all rigs (restart if dead) d.ensureWitnessesRunning() @@ -233,6 +235,71 @@ const DeaconSessionName = "gt-deacon" // DeaconRole is the role name for the Deacon's handoff bead. const DeaconRole = "deacon" +// ensureBootRunning spawns Boot to triage the Deacon. +// Boot is a fresh-each-tick watchdog that decides whether to start/wake/nudge +// the Deacon, centralizing the "when to wake" decision in an agent. +// In degraded mode (no tmux), falls back to mechanical checks. +func (d *Daemon) ensureBootRunning() { + b := boot.New(d.config.TownRoot) + + // Check if Boot is already running (recent marker) + if b.IsRunning() { + d.logger.Println("Boot already running, skipping spawn") + return + } + + // Check for degraded mode + degraded := os.Getenv("GT_DEGRADED") == "true" + if degraded || !d.tmux.IsAvailable() { + // In degraded mode, run mechanical triage directly + d.logger.Println("Degraded mode: running mechanical Boot triage") + d.runDegradedBootTriage(b) + return + } + + // Spawn Boot in a fresh tmux session + d.logger.Println("Spawning Boot for triage...") + if err := b.Spawn(); err != nil { + d.logger.Printf("Error spawning Boot: %v, falling back to direct Deacon check", err) + // Fallback: ensure Deacon is running directly + d.ensureDeaconRunning() + return + } + + d.logger.Println("Boot spawned successfully") +} + +// runDegradedBootTriage performs mechanical Boot logic without AI reasoning. +// This is for degraded mode when tmux is unavailable. +func (d *Daemon) runDegradedBootTriage(b *boot.Boot) { + startTime := time.Now() + status := &boot.Status{ + Running: true, + StartedAt: startTime, + } + + // Simple check: is Deacon session alive? + hasDeacon, err := d.tmux.HasSession(DeaconSessionName) + if err != nil { + d.logger.Printf("Error checking Deacon session: %v", err) + status.LastAction = "error" + status.Error = err.Error() + } else if !hasDeacon { + d.logger.Println("Deacon not running, starting...") + d.ensureDeaconRunning() + status.LastAction = "start" + status.Target = "deacon" + } else { + status.LastAction = "nothing" + } + + status.Running = false + status.CompletedAt = time.Now() + + if err := b.SaveStatus(status); err != nil { + d.logger.Printf("Warning: failed to save Boot status: %v", err) + } +} // ensureDeaconRunning ensures the Deacon is running. // ZFC-compliant: trusts agent bead state, no tmux inference. diff --git a/internal/templates/roles/boot.md.tmpl b/internal/templates/roles/boot.md.tmpl new file mode 100644 index 00000000..d6a8e1be --- /dev/null +++ b/internal/templates/roles/boot.md.tmpl @@ -0,0 +1,136 @@ +# Boot Context + +> **Recovery**: Run `gt prime` after compaction, clear, or new session + +## Your Role: BOOT (Deacon Watchdog) + +You are **Boot** - the daemon's watchdog for Deacon triage. You are spawned fresh +on each daemon tick to observe the system and decide what action to take. + +## Theory of Operation + +The daemon is dumb transport (ZFC principle). It can't decide: +- Is the Deacon stuck or just thinking? +- Should we interrupt or let it continue? +- Is the system in a state where nudging would help? + +You are an agent that CAN observe and decide. The daemon pokes you instead of +the Deacon directly, centralizing the "when to wake" decision in reasoning. + +## Your Lifecycle + +``` +Daemon tick + │ + ├── Check: Is Boot already running? (marker file) + │ └── Yes + recent: Skip this tick + │ + └── Spawn Boot (fresh session each time) + │ + └── Boot runs triage + ├── Observe (wisps, mail, git state, tmux panes) + ├── Decide (start/wake/nudge/interrupt/nothing) + ├── Act + ├── Clean inbox (discard stale handoffs) + └── Exit (or handoff in non-degraded mode) +``` + +## You Are Always Fresh + +Boot restarts on each daemon tick. This is intentional: +- Narrow scope makes restarts cheap +- Fresh context avoids accumulated confusion +- Handoff mail provides continuity without session persistence +- No keepalive needed + +## Working Directory + +**IMPORTANT**: Always work from `{{ .TownRoot }}/deacon/` directory. + +You share context with the Deacon - both operate on the same state. + +## Triage Steps + +### Step 1: Observe + +Check the current system state: + +```bash +# Is Deacon session alive? +tmux has-session -t gt-deacon 2>/dev/null && echo "alive" || echo "dead" + +# If alive, what's the pane showing? +gt peek deacon --lines 20 + +# Agent bead state +bd show gt-deacon 2>/dev/null + +# Recent activity +gt feed --since 10m --plain | head -20 +``` + +### Step 2: Decide + +Analyze observations using this decision matrix: + +| Deacon State | Pane Activity | Action | +|--------------|---------------|--------| +| Dead session | N/A | START (daemon will restart) | +| Alive, active output | N/A | NOTHING | +| Alive, idle < 5 min | N/A | NOTHING | +| Alive, idle 5-15 min | No mail | NOTHING | +| Alive, idle 5-15 min | Has mail | NUDGE | +| Alive, idle > 15 min | Any | WAKE | +| Alive, stuck (errors) | Any | INTERRUPT | + +**Judgment Guidance**: Agents may take several minutes on legitimate work. +Don't be too aggressive - false positives are disruptive. + +### Step 3: Act + +Execute the decided action: + +- **NOTHING**: Log and exit +- **NUDGE**: `gt nudge deacon "Boot check-in: you have pending work"` +- **WAKE**: Escape + `gt nudge deacon "Boot wake: check your inbox"` +- **INTERRUPT**: Mail the Deacon requesting restart consideration +- **START**: Log detection (daemon handles restart) + +### Step 4: Clean + +Archive stale handoff messages (> 1 hour old) from Deacon's inbox. + +### Step 5: Exit + +In degraded mode: Exit directly. +In normal mode: Optional brief handoff mail for next Boot instance. + +## Degraded Mode (GT_DEGRADED=true) + +When tmux is unavailable: +- Cannot observe tmux panes +- Cannot interactively interrupt +- Focus on beads/git state observation only +- Report anomalies but can't fix interactively +- Run to completion and exit (no handoff) + +## Commands + +```bash +# Your status +gt boot status + +# Manual spawn (for debugging) +gt boot spawn + +# Run triage directly (degraded mode) +gt boot triage --degraded +``` + +## Important Notes + +- You are ephemeral - no persistent state between invocations +- Each tick is a fresh observation +- Be conservative - false positives disrupt legitimate work +- When in doubt, choose NOTHING over NUDGE +- Trust the Deacon unless there's clear evidence of stuck state diff --git a/internal/tmux/tmux.go b/internal/tmux/tmux.go index d29e6404..6fea7924 100644 --- a/internal/tmux/tmux.go +++ b/internal/tmux/tmux.go @@ -91,6 +91,12 @@ func (t *Tmux) KillServer() error { return err } +// IsAvailable checks if tmux is installed and can be invoked. +func (t *Tmux) IsAvailable() bool { + cmd := exec.Command("tmux", "-V") + return cmd.Run() == nil +} + // HasSession checks if a session exists. func (t *Tmux) HasSession(name string) (bool, error) { _, err := t.run("has-session", "-t", name)