refactor: ZFC cleanup - move Go heuristics to Deacon molecule (gt-gaxo)

Remove Go code that makes workflow decisions. All health checking,
staleness detection, nudging, and escalation belongs in the Deacon
molecule where Claude executes it.

Removed:
- internal/daemon/backoff.go (190 lines) - exponential backoff decisions
- internal/doctor/stale_check.go (284 lines) - staleness detection
- IsFresh/IsStale/IsVeryStale from keepalive.go
- pokeMayor, pokeWitnesses, pokeWitness from daemon.go
- Heartbeat staleness classification from pokeDeacon

Changed:
- Lifecycle parsing now uses structured body (JSON or simple text)
  instead of keyword matching on subject line
- Daemon now only ensures Deacon is running and sends simple heartbeats
- No backoff, no staleness classification, no decision-making

Total: ~800 lines removed from Go code

The Deacon molecule will handle all health checking, nudging, and
escalation. Go is now just a message router.

See gt-gaxo epic for full rationale.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Steve Yegge
2025-12-24 00:11:15 -08:00
parent 0f88c793f8
commit b6817899b4
13 changed files with 145 additions and 1224 deletions

View File

@@ -1,190 +0,0 @@
package daemon
import (
"time"
)
// BackoffStrategy defines how poke intervals grow after consecutive
// misses (pokes that saw no agent activity).
type BackoffStrategy string

const (
	// StrategyFixed keeps the same interval (no backoff).
	StrategyFixed BackoffStrategy = "fixed"
	// StrategyGeometric multiplies the interval by BackoffConfig.Factor
	// on each miss (1.5x with the default config).
	StrategyGeometric BackoffStrategy = "geometric"
	// StrategyExponential doubles the interval on each miss (2x).
	StrategyExponential BackoffStrategy = "exponential"
)
// BackoffConfig holds backoff configuration shared by all agents
// tracked by a BackoffManager.
type BackoffConfig struct {
	// Strategy determines how intervals grow on consecutive misses.
	Strategy BackoffStrategy
	// BaseInterval is the starting interval between pokes
	// (DefaultBackoffConfig uses 5m, not the 60s an older comment claimed).
	BaseInterval time.Duration
	// MaxInterval caps how large intervals can grow
	// (DefaultBackoffConfig uses 30m).
	MaxInterval time.Duration
	// Factor is the multiplier for geometric backoff (default 1.5).
	// It is ignored by the fixed and exponential strategies.
	Factor float64
}
// DefaultBackoffConfig returns sensible defaults.
//
// The base interval is 5 minutes because a deacon round may take a
// while (health checks, plugins, syncing clones, complex remediation).
// The max interval is 30 minutes — beyond that, something is likely wrong.
func DefaultBackoffConfig() *BackoffConfig {
	cfg := BackoffConfig{
		Strategy:     StrategyGeometric,
		Factor:       1.5,
		BaseInterval: 5 * time.Minute,
		MaxInterval:  30 * time.Minute,
	}
	return &cfg
}
// AgentBackoff tracks backoff state for a single agent. Intervals grow
// on misses (RecordMiss) and snap back to BaseInterval on activity
// (RecordActivity).
type AgentBackoff struct {
	// AgentID identifies the agent (e.g., "mayor", "gastown-witness").
	AgentID string
	// BaseInterval is the starting interval, restored on activity.
	BaseInterval time.Duration
	// CurrentInterval is the current (possibly backed-off) interval.
	CurrentInterval time.Duration
	// MaxInterval caps how large intervals can grow.
	MaxInterval time.Duration
	// ConsecutiveMiss counts pokes with no response since the last activity.
	ConsecutiveMiss int
	// LastPoke is when we last poked this agent (zero if never poked).
	LastPoke time.Time
	// LastActivity is when the agent last showed activity (zero if never).
	LastActivity time.Time
}
// NewAgentBackoff creates backoff state for an agent, starting at the
// config's base interval. A nil config falls back to DefaultBackoffConfig.
func NewAgentBackoff(agentID string, config *BackoffConfig) *AgentBackoff {
	cfg := config
	if cfg == nil {
		cfg = DefaultBackoffConfig()
	}
	state := AgentBackoff{
		AgentID:         agentID,
		BaseInterval:    cfg.BaseInterval,
		CurrentInterval: cfg.BaseInterval,
		MaxInterval:     cfg.MaxInterval,
	}
	return &state
}
// ShouldPoke reports whether enough time has passed since the last
// poke. An agent that has never been poked is always due.
func (ab *AgentBackoff) ShouldPoke() bool {
	if ab.LastPoke.IsZero() {
		return true // never poked yet
	}
	elapsed := time.Since(ab.LastPoke)
	return elapsed >= ab.CurrentInterval
}
// RecordPoke records that we poked the agent just now; ShouldPoke
// stays false until CurrentInterval elapses again.
func (ab *AgentBackoff) RecordPoke() {
	ab.LastPoke = time.Now()
}
// RecordMiss records that the agent didn't respond since the last poke,
// growing the interval per the configured strategy and capping it at
// MaxInterval. A nil config falls back to DefaultBackoffConfig.
func (ab *AgentBackoff) RecordMiss(config *BackoffConfig) {
	ab.ConsecutiveMiss++
	cfg := config
	if cfg == nil {
		cfg = DefaultBackoffConfig()
	}
	// Compute the next interval, then clamp it to the cap.
	next := ab.CurrentInterval
	switch cfg.Strategy {
	case StrategyGeometric:
		next = time.Duration(float64(next) * cfg.Factor)
	case StrategyExponential:
		next *= 2
	case StrategyFixed:
		// Interval unchanged by design.
	}
	if next > ab.MaxInterval {
		next = ab.MaxInterval
	}
	ab.CurrentInterval = next
}
// RecordActivity records that the agent showed activity. This resets
// the backoff to the base interval, clears the miss counter, and
// stamps LastActivity.
func (ab *AgentBackoff) RecordActivity() {
	ab.ConsecutiveMiss = 0
	ab.CurrentInterval = ab.BaseInterval
	ab.LastActivity = time.Now()
}
// BackoffManager tracks backoff state for all agents, creating
// per-agent state lazily on first use.
// NOTE(review): the agents map is not mutex-guarded; this assumes
// access from a single goroutine (the daemon loop) — confirm.
type BackoffManager struct {
	config *BackoffConfig            // shared config applied to every agent
	agents map[string]*AgentBackoff  // keyed by agent ID
}
// NewBackoffManager creates a new backoff manager with no tracked
// agents. A nil config falls back to DefaultBackoffConfig.
func NewBackoffManager(config *BackoffConfig) *BackoffManager {
	cfg := config
	if cfg == nil {
		cfg = DefaultBackoffConfig()
	}
	mgr := BackoffManager{agents: map[string]*AgentBackoff{}}
	mgr.config = cfg
	return &mgr
}
// GetOrCreate returns backoff state for an agent, creating it with the
// manager's config on first reference.
func (bm *BackoffManager) GetOrCreate(agentID string) *AgentBackoff {
	ab, ok := bm.agents[agentID]
	if !ok {
		ab = NewAgentBackoff(agentID, bm.config)
		bm.agents[agentID] = ab
	}
	return ab
}
// ShouldPoke returns true if we should poke the given agent.
// NOTE(review): these delegate methods are not synchronized; they
// assume single-goroutine use by the daemon loop — confirm.
func (bm *BackoffManager) ShouldPoke(agentID string) bool {
	return bm.GetOrCreate(agentID).ShouldPoke()
}

// RecordPoke records that we poked an agent.
func (bm *BackoffManager) RecordPoke(agentID string) {
	bm.GetOrCreate(agentID).RecordPoke()
}

// RecordMiss records that an agent didn't respond, growing that
// agent's interval per the manager's config.
func (bm *BackoffManager) RecordMiss(agentID string) {
	bm.GetOrCreate(agentID).RecordMiss(bm.config)
}

// RecordActivity records that an agent showed activity, resetting its
// backoff to the base interval.
func (bm *BackoffManager) RecordActivity(agentID string) {
	bm.GetOrCreate(agentID).RecordActivity()
}

// GetInterval returns the current interval for an agent.
func (bm *BackoffManager) GetInterval(agentID string) time.Duration {
	return bm.GetOrCreate(agentID).CurrentInterval
}
// Stats returns the current poke interval for every tracked agent,
// keyed by agent ID. Intended for logging.
func (bm *BackoffManager) Stats() map[string]time.Duration {
	out := make(map[string]time.Duration, len(bm.agents))
	for id, state := range bm.agents {
		out[id] = state.CurrentInterval
	}
	return out
}

View File

@@ -1,290 +0,0 @@
package daemon
import (
"testing"
"time"
)
// TestDefaultBackoffConfig pins the documented defaults: geometric
// strategy, 5m base, 30m cap, 1.5x factor.
func TestDefaultBackoffConfig(t *testing.T) {
	config := DefaultBackoffConfig()
	if config.Strategy != StrategyGeometric {
		t.Errorf("expected strategy Geometric, got %v", config.Strategy)
	}
	if config.BaseInterval != 5*time.Minute {
		t.Errorf("expected base interval 5m, got %v", config.BaseInterval)
	}
	if config.MaxInterval != 30*time.Minute {
		t.Errorf("expected max interval 30m, got %v", config.MaxInterval)
	}
	if config.Factor != 1.5 {
		t.Errorf("expected factor 1.5, got %v", config.Factor)
	}
}

// TestNewAgentBackoff verifies a fresh agent starts at the base
// interval with zero recorded misses.
func TestNewAgentBackoff(t *testing.T) {
	config := DefaultBackoffConfig()
	ab := NewAgentBackoff("test-agent", config)
	if ab.AgentID != "test-agent" {
		t.Errorf("expected agent ID 'test-agent', got %s", ab.AgentID)
	}
	if ab.BaseInterval != 5*time.Minute {
		t.Errorf("expected base interval 5m, got %v", ab.BaseInterval)
	}
	if ab.CurrentInterval != 5*time.Minute {
		t.Errorf("expected current interval 5m, got %v", ab.CurrentInterval)
	}
	if ab.ConsecutiveMiss != 0 {
		t.Errorf("expected consecutive miss 0, got %d", ab.ConsecutiveMiss)
	}
}
// TestAgentBackoff_ShouldPoke exercises the poke-due logic against a
// short real-time interval.
// NOTE(review): sleep-based timing can flake on a heavily loaded CI
// machine; consider injecting a clock.
func TestAgentBackoff_ShouldPoke(t *testing.T) {
	config := &BackoffConfig{
		Strategy:     StrategyGeometric,
		BaseInterval: 100 * time.Millisecond, // Short for testing
		MaxInterval:  1 * time.Second,
		Factor:       1.5,
	}
	ab := NewAgentBackoff("test", config)
	// Should poke immediately (never poked)
	if !ab.ShouldPoke() {
		t.Error("expected ShouldPoke=true for new agent")
	}
	// Record a poke
	ab.RecordPoke()
	// Should not poke immediately after
	if ab.ShouldPoke() {
		t.Error("expected ShouldPoke=false immediately after poke")
	}
	// Wait for interval
	time.Sleep(110 * time.Millisecond)
	// Now should poke again
	if !ab.ShouldPoke() {
		t.Error("expected ShouldPoke=true after interval elapsed")
	}
}

// TestAgentBackoff_GeometricBackoff verifies the interval grows by the
// configured factor (1.5x) on each recorded miss.
func TestAgentBackoff_GeometricBackoff(t *testing.T) {
	config := &BackoffConfig{
		Strategy:     StrategyGeometric,
		BaseInterval: 100 * time.Millisecond,
		MaxInterval:  1 * time.Second,
		Factor:       1.5,
	}
	ab := NewAgentBackoff("test", config)
	// Initial interval
	if ab.CurrentInterval != 100*time.Millisecond {
		t.Errorf("expected initial interval 100ms, got %v", ab.CurrentInterval)
	}
	// First miss: 100ms * 1.5 = 150ms
	ab.RecordMiss(config)
	if ab.CurrentInterval != 150*time.Millisecond {
		t.Errorf("expected interval 150ms after 1 miss, got %v", ab.CurrentInterval)
	}
	if ab.ConsecutiveMiss != 1 {
		t.Errorf("expected consecutive miss 1, got %d", ab.ConsecutiveMiss)
	}
	// Second miss: 150ms * 1.5 = 225ms
	ab.RecordMiss(config)
	if ab.CurrentInterval != 225*time.Millisecond {
		t.Errorf("expected interval 225ms after 2 misses, got %v", ab.CurrentInterval)
	}
	// Third miss: 225ms * 1.5 = 337.5ms (not representable as an
	// integral millisecond count, hence the raw nanosecond constant)
	ab.RecordMiss(config)
	expected := time.Duration(337500000) // 337.5ms in nanoseconds
	if ab.CurrentInterval != expected {
		t.Errorf("expected interval ~337.5ms after 3 misses, got %v", ab.CurrentInterval)
	}
}
// TestAgentBackoff_ExponentialBackoff verifies the interval doubles on
// each miss and that Factor is ignored for this strategy.
func TestAgentBackoff_ExponentialBackoff(t *testing.T) {
	config := &BackoffConfig{
		Strategy:     StrategyExponential,
		BaseInterval: 100 * time.Millisecond,
		MaxInterval:  1 * time.Second,
		Factor:       2.0, // Ignored for exponential
	}
	ab := NewAgentBackoff("test", config)
	// First miss: 100ms * 2 = 200ms
	ab.RecordMiss(config)
	if ab.CurrentInterval != 200*time.Millisecond {
		t.Errorf("expected interval 200ms after 1 miss, got %v", ab.CurrentInterval)
	}
	// Second miss: 200ms * 2 = 400ms
	ab.RecordMiss(config)
	if ab.CurrentInterval != 400*time.Millisecond {
		t.Errorf("expected interval 400ms after 2 misses, got %v", ab.CurrentInterval)
	}
	// Third miss: 400ms * 2 = 800ms
	ab.RecordMiss(config)
	if ab.CurrentInterval != 800*time.Millisecond {
		t.Errorf("expected interval 800ms after 3 misses, got %v", ab.CurrentInterval)
	}
}

// TestAgentBackoff_FixedStrategy verifies the interval never changes
// under StrategyFixed while the miss counter still accumulates.
func TestAgentBackoff_FixedStrategy(t *testing.T) {
	config := &BackoffConfig{
		Strategy:     StrategyFixed,
		BaseInterval: 100 * time.Millisecond,
		MaxInterval:  1 * time.Second,
		Factor:       1.5,
	}
	ab := NewAgentBackoff("test", config)
	// Multiple misses should not change interval
	ab.RecordMiss(config)
	ab.RecordMiss(config)
	ab.RecordMiss(config)
	if ab.CurrentInterval != 100*time.Millisecond {
		t.Errorf("expected interval to stay at 100ms with fixed strategy, got %v", ab.CurrentInterval)
	}
	if ab.ConsecutiveMiss != 3 {
		t.Errorf("expected consecutive miss 3, got %d", ab.ConsecutiveMiss)
	}
}

// TestAgentBackoff_MaxInterval verifies growth is clamped at the
// configured cap even after many misses.
func TestAgentBackoff_MaxInterval(t *testing.T) {
	config := &BackoffConfig{
		Strategy:     StrategyExponential,
		BaseInterval: 100 * time.Millisecond,
		MaxInterval:  500 * time.Millisecond,
		Factor:       2.0,
	}
	ab := NewAgentBackoff("test", config)
	// Keep missing until we hit the cap
	for i := 0; i < 10; i++ {
		ab.RecordMiss(config)
	}
	if ab.CurrentInterval != 500*time.Millisecond {
		t.Errorf("expected interval capped at 500ms, got %v", ab.CurrentInterval)
	}
}
// TestAgentBackoff_RecordActivity verifies that observed activity
// resets the interval, clears the miss counter, and stamps LastActivity.
func TestAgentBackoff_RecordActivity(t *testing.T) {
	config := &BackoffConfig{
		Strategy:     StrategyGeometric,
		BaseInterval: 100 * time.Millisecond,
		MaxInterval:  1 * time.Second,
		Factor:       1.5,
	}
	ab := NewAgentBackoff("test", config)
	// Build up some backoff
	ab.RecordMiss(config)
	ab.RecordMiss(config)
	ab.RecordMiss(config)
	if ab.CurrentInterval == 100*time.Millisecond {
		t.Error("expected interval to have increased")
	}
	if ab.ConsecutiveMiss != 3 {
		t.Errorf("expected consecutive miss 3, got %d", ab.ConsecutiveMiss)
	}
	// Record activity - should reset
	ab.RecordActivity()
	if ab.CurrentInterval != 100*time.Millisecond {
		t.Errorf("expected interval reset to 100ms, got %v", ab.CurrentInterval)
	}
	if ab.ConsecutiveMiss != 0 {
		t.Errorf("expected consecutive miss reset to 0, got %d", ab.ConsecutiveMiss)
	}
	if ab.LastActivity.IsZero() {
		t.Error("expected LastActivity to be set")
	}
}

// TestBackoffManager_GetOrCreate verifies lazy creation and identity:
// the same ID yields the same instance, different IDs yield new ones.
func TestBackoffManager_GetOrCreate(t *testing.T) {
	bm := NewBackoffManager(DefaultBackoffConfig())
	// First call creates
	ab1 := bm.GetOrCreate("agent1")
	if ab1 == nil {
		t.Fatal("expected agent backoff to be created")
	}
	if ab1.AgentID != "agent1" {
		t.Errorf("expected agent ID 'agent1', got %s", ab1.AgentID)
	}
	// Second call returns same instance
	ab2 := bm.GetOrCreate("agent1")
	if ab1 != ab2 {
		t.Error("expected same instance on second call")
	}
	// Different agent creates new instance
	ab3 := bm.GetOrCreate("agent2")
	if ab1 == ab3 {
		t.Error("expected different instance for different agent")
	}
}
// TestBackoffManager_Stats verifies per-agent intervals reported by
// Stats after a mix of pokes and misses.
func TestBackoffManager_Stats(t *testing.T) {
	config := &BackoffConfig{
		Strategy:     StrategyGeometric,
		BaseInterval: 100 * time.Millisecond,
		MaxInterval:  1 * time.Second,
		Factor:       1.5,
	}
	bm := NewBackoffManager(config)
	// Create some agents with different backoff states
	bm.RecordPoke("agent1")
	bm.RecordMiss("agent1")
	bm.RecordPoke("agent2")
	bm.RecordMiss("agent2")
	bm.RecordMiss("agent2")
	stats := bm.Stats()
	if len(stats) != 2 {
		t.Errorf("expected 2 agents in stats, got %d", len(stats))
	}
	// agent1: 100ms * 1.5 = 150ms
	if stats["agent1"] != 150*time.Millisecond {
		t.Errorf("expected agent1 interval 150ms, got %v", stats["agent1"])
	}
	// agent2: 100ms * 1.5 * 1.5 = 225ms
	if stats["agent2"] != 225*time.Millisecond {
		t.Errorf("expected agent2 interval 225ms, got %v", stats["agent2"])
	}
}

// TestExtractRigName covers rig-name extraction from witness session
// names, including rig names that themselves contain dashes.
// NOTE(review): extractRigName is defined in daemon.go; this test
// rides along in the backoff test file — legal within one package,
// but consider relocating it.
func TestExtractRigName(t *testing.T) {
	tests := []struct {
		session  string
		expected string
	}{
		{"gt-gastown-witness", "gastown"},
		{"gt-myrig-witness", "myrig"},
		{"gt-my-rig-name-witness", "my-rig-name"},
	}
	for _, tc := range tests {
		result := extractRigName(tc.session)
		if result != tc.expected {
			t.Errorf("extractRigName(%q) = %q, expected %q", tc.session, result, tc.expected)
		}
	}
}

View File

@@ -2,34 +2,27 @@ package daemon
import (
"context"
"encoding/json"
"fmt"
"log"
"os"
"os/signal"
"path/filepath"
"strconv"
"strings"
"syscall"
"time"
"github.com/steveyegge/gastown/internal/config"
"github.com/steveyegge/gastown/internal/constants"
"github.com/steveyegge/gastown/internal/git"
"github.com/steveyegge/gastown/internal/keepalive"
"github.com/steveyegge/gastown/internal/rig"
"github.com/steveyegge/gastown/internal/tmux"
)
// Daemon is the town-level background service.
// Its only job is to ensure Deacon is running and send periodic heartbeats.
// All health checking, nudging, and decision-making belongs in the Deacon molecule.
type Daemon struct {
config *Config
tmux *tmux.Tmux
logger *log.Logger
ctx context.Context
cancel context.CancelFunc
backoff *BackoffManager
notifications *NotificationManager
lastMOTDIndex int // tracks last MOTD to avoid consecutive repeats
}
@@ -50,18 +43,12 @@ func New(config *Config) (*Daemon, error) {
logger := log.New(logFile, "", log.LstdFlags)
ctx, cancel := context.WithCancel(context.Background())
// Initialize notification manager for slot-based deduplication
notifDir := filepath.Join(daemonDir, "notifications")
notifMaxAge := 5 * time.Minute // Notifications expire after 5 minutes
return &Daemon{
config: config,
tmux: tmux.NewTmux(),
logger: logger,
ctx: ctx,
cancel: cancel,
backoff: NewBackoffManager(DefaultBackoffConfig()),
notifications: NewNotificationManager(notifDir, notifMaxAge),
config: config,
tmux: tmux.NewTmux(),
logger: logger,
ctx: ctx,
cancel: cancel,
}, nil
}
@@ -121,17 +108,15 @@ func (d *Daemon) Run() error {
}
// heartbeat performs one heartbeat cycle.
// The daemon's job is minimal: ensure Deacon is running and send heartbeats.
// All health checking and decision-making belongs in the Deacon molecule.
func (d *Daemon) heartbeat(state *State) {
d.logger.Println("Heartbeat starting")
// 0. Clean up stale notification slots periodically
_ = d.notifications.ClearStaleSlots()
// 1. Ensure Deacon is running (the Deacon is the heartbeat of the system)
// 1. Ensure Deacon is running (process management)
d.ensureDeaconRunning()
// 2. Poke Deacon - the Deacon monitors Mayor and Witnesses
// Note: Deacon self-spawns wisps for patrol cycles (no daemon attachment needed)
// 2. Send heartbeat to Deacon (simple notification, no decision-making)
d.pokeDeacon()
// 3. Process lifecycle requests
@@ -243,10 +228,9 @@ func (d *Daemon) ensureDeaconRunning() {
}
// pokeDeacon sends a heartbeat message to the Deacon session.
// The Deacon is responsible for monitoring Mayor and Witnesses.
// Simple notification - no staleness checking or backoff logic.
// The Deacon molecule decides what to do with heartbeats.
func (d *Daemon) pokeDeacon() {
const agentID = "deacon"
running, err := d.tmux.HasSession(DeaconSessionName)
if err != nil {
d.logger.Printf("Error checking Deacon session: %v", err)
@@ -258,49 +242,6 @@ func (d *Daemon) pokeDeacon() {
return
}
// Check deacon heartbeat to see if it's active
deaconHeartbeatFile := filepath.Join(d.config.TownRoot, "deacon", "heartbeat.json")
var isFresh, isStale, isVeryStale bool
data, err := os.ReadFile(deaconHeartbeatFile)
if err == nil {
var hb struct {
Timestamp time.Time `json:"timestamp"`
}
if json.Unmarshal(data, &hb) == nil {
age := time.Since(hb.Timestamp)
isFresh = age < 2*time.Minute
isStale = age >= 2*time.Minute && age < 5*time.Minute
isVeryStale = age >= 5*time.Minute
} else {
isVeryStale = true
}
} else {
isVeryStale = true // No heartbeat file
}
if isFresh {
// Deacon is actively working, reset backoff and mark notifications consumed
d.backoff.RecordActivity(agentID)
_ = d.notifications.MarkConsumed(DeaconSessionName, SlotHeartbeat)
d.logger.Println("Deacon is fresh, skipping poke")
return
}
// Check if we should poke based on backoff interval
if !d.backoff.ShouldPoke(agentID) {
interval := d.backoff.GetInterval(agentID)
d.logger.Printf("Deacon backoff in effect (interval: %v), skipping poke", interval)
return
}
// Check if we should send (slot-based deduplication)
shouldSend, _ := d.notifications.ShouldSend(DeaconSessionName, SlotHeartbeat)
if !shouldSend {
d.logger.Println("Heartbeat already pending for Deacon, skipping")
return
}
// Send heartbeat message with rotating MOTD
motd := d.nextMOTD()
msg := fmt.Sprintf("HEARTBEAT: %s", motd)
@@ -309,253 +250,12 @@ func (d *Daemon) pokeDeacon() {
return
}
// Record the send for slot deduplication
_ = d.notifications.RecordSend(DeaconSessionName, SlotHeartbeat, msg)
d.backoff.RecordPoke(agentID)
// Adjust backoff based on staleness
if isVeryStale {
d.backoff.RecordMiss(agentID)
interval := d.backoff.GetInterval(agentID)
d.logger.Printf("Poked Deacon (very stale, backoff now: %v)", interval)
} else if isStale {
d.logger.Println("Poked Deacon (stale)")
} else {
d.logger.Println("Poked Deacon")
}
d.logger.Println("Poked Deacon")
}
// pokeMayor sends a heartbeat to the Mayor session.
//
// Flow: skip if the session is missing or the Mayor's keepalive file
// is fresh; otherwise honor the backoff interval and the per-slot
// notification dedup before sending, then widen the backoff if the
// Mayor looks very stale (or has no keepalive state at all).
func (d *Daemon) pokeMayor() {
	mayorSession := constants.SessionMayor
	agentID := constants.RoleMayor
	running, err := d.tmux.HasSession(mayorSession)
	if err != nil {
		d.logger.Printf("Error checking Mayor session: %v", err)
		return
	}
	if !running {
		d.logger.Println("Mayor session not running, skipping poke")
		return
	}
	// Check keepalive to see if agent is active
	state := keepalive.Read(d.config.TownRoot)
	if state != nil && state.IsFresh() {
		// Agent is actively working, reset backoff and mark notifications consumed
		d.backoff.RecordActivity(agentID)
		_ = d.notifications.MarkConsumed(mayorSession, SlotHeartbeat)
		d.logger.Printf("Mayor is fresh (cmd: %s), skipping poke", state.LastCommand)
		return
	}
	// Check if we should poke based on backoff interval
	if !d.backoff.ShouldPoke(agentID) {
		interval := d.backoff.GetInterval(agentID)
		d.logger.Printf("Mayor backoff in effect (interval: %v), skipping poke", interval)
		return
	}
	// Check if we should send (slot-based deduplication)
	shouldSend, _ := d.notifications.ShouldSend(mayorSession, SlotHeartbeat)
	if !shouldSend {
		d.logger.Println("Heartbeat already pending for Mayor, skipping")
		return
	}
	// Send heartbeat message via tmux, replacing any pending input
	msg := "HEARTBEAT: check your rigs"
	if err := d.tmux.SendKeysReplace(mayorSession, msg, 50); err != nil {
		d.logger.Printf("Error poking Mayor: %v", err)
		return
	}
	// Record the send for slot deduplication
	_ = d.notifications.RecordSend(mayorSession, SlotHeartbeat, msg)
	d.backoff.RecordPoke(agentID)
	// If agent is stale or very stale, record a miss (increase backoff)
	if state == nil || state.IsVeryStale() {
		d.backoff.RecordMiss(agentID)
		interval := d.backoff.GetInterval(agentID)
		d.logger.Printf("Poked Mayor (very stale, backoff now: %v)", interval)
	} else if state.IsStale() {
		// Stale but not very stale - don't increase backoff, but don't reset either
		d.logger.Println("Poked Mayor (stale)")
	} else {
		d.logger.Println("Poked Mayor")
	}
}
// pokeWitnesses sends heartbeats to all Witness sessions.
// Uses proper rig discovery from rigs.json instead of scanning tmux
// sessions. Rigs without a running witness session are logged but not
// auto-started.
func (d *Daemon) pokeWitnesses() {
	// Discover rigs from configuration
	rigs := d.discoverRigs()
	if len(rigs) == 0 {
		d.logger.Println("No rigs discovered")
		return
	}
	for _, r := range rigs {
		session := fmt.Sprintf("gt-%s-witness", r.Name)
		// Check if witness session exists
		running, err := d.tmux.HasSession(session)
		if err != nil {
			d.logger.Printf("Error checking witness session for rig %s: %v", r.Name, err)
			continue
		}
		if !running {
			// Rig exists but no witness session - log for visibility
			d.logger.Printf("Rig %s has no witness session (may need: gt witness start %s)", r.Name, r.Name)
			continue
		}
		d.pokeWitness(session)
	}
}
// discoverRigs finds all registered rigs using the rig manager.
// Falls back to directory scanning if rigs.json is not available or
// the manager's discovery fails.
func (d *Daemon) discoverRigs() []*rig.Rig {
	// Load rigs config from mayor/rigs.json
	rigsConfigPath := constants.MayorRigsPath(d.config.TownRoot)
	rigsConfig, err := config.LoadRigsConfig(rigsConfigPath)
	if err != nil {
		// Try fallback: scan town directory for rig directories
		return d.discoverRigsFromDirectory()
	}
	// Use rig manager for proper discovery
	g := git.NewGit(d.config.TownRoot)
	mgr := rig.NewManager(d.config.TownRoot, rigsConfig, g)
	rigs, err := mgr.DiscoverRigs()
	if err != nil {
		d.logger.Printf("Error discovering rigs from config: %v", err)
		return d.discoverRigsFromDirectory()
	}
	return rigs
}
// discoverRigsFromDirectory scans the town directory for rig directories.
// A directory is considered a rig if it has a .beads subdirectory or a
// config.json file. Used as a fallback when rigs.json is unavailable.
// Returns nil (not an error) if the town directory cannot be read.
func (d *Daemon) discoverRigsFromDirectory() []*rig.Rig {
	entries, err := os.ReadDir(d.config.TownRoot)
	if err != nil {
		d.logger.Printf("Error reading town directory: %v", err)
		return nil
	}
	var rigs []*rig.Rig
	for _, entry := range entries {
		if !entry.IsDir() {
			continue
		}
		name := entry.Name()
		// Skip known non-rig directories and anything hidden. The
		// HasPrefix check subsumes the old explicit ".git" case and
		// avoids indexing into the name byte-wise.
		if name == "mayor" || name == "daemon" || strings.HasPrefix(name, ".") {
			continue
		}
		dirPath := filepath.Join(d.config.TownRoot, name)
		// Check for .beads directory (indicates a rig)
		if _, err := os.Stat(filepath.Join(dirPath, ".beads")); err == nil {
			rigs = append(rigs, &rig.Rig{Name: name, Path: dirPath})
			continue
		}
		// Otherwise accept any directory carrying a config.json.
		// NOTE(review): the config contents are not inspected, so any
		// directory with a config.json counts as a rig — confirm this
		// looseness is intended.
		if _, err := os.Stat(filepath.Join(dirPath, "config.json")); err == nil {
			rigs = append(rigs, &rig.Rig{Name: name, Path: dirPath})
		}
	}
	return rigs
}
// pokeWitness sends a heartbeat to a single witness session with backoff.
// Same flow as pokeMayor: skip if fresh, honor backoff and slot dedup,
// then widen the backoff if the witness looks very stale.
func (d *Daemon) pokeWitness(session string) {
	// Extract rig name from session (gt-<rig>-witness -> <rig>)
	rigName := extractRigName(session)
	agentID := session // Use session name as agent ID
	// Find the rig's workspace for keepalive check.
	// NOTE(review): the literal "gastown" path component looks wrong
	// for rigs not named gastown — confirm rig workspaces really live
	// under <town>/gastown/<rig>.
	rigWorkspace := filepath.Join(d.config.TownRoot, "gastown", rigName)
	// Check keepalive to see if the witness is active
	state := keepalive.Read(rigWorkspace)
	if state != nil && state.IsFresh() {
		// Witness is actively working, reset backoff and mark notifications consumed
		d.backoff.RecordActivity(agentID)
		_ = d.notifications.MarkConsumed(session, SlotHeartbeat)
		d.logger.Printf("Witness %s is fresh (cmd: %s), skipping poke", session, state.LastCommand)
		return
	}
	// Check if we should poke based on backoff interval
	if !d.backoff.ShouldPoke(agentID) {
		interval := d.backoff.GetInterval(agentID)
		d.logger.Printf("Witness %s backoff in effect (interval: %v), skipping poke", session, interval)
		return
	}
	// Check if we should send (slot-based deduplication)
	shouldSend, _ := d.notifications.ShouldSend(session, SlotHeartbeat)
	if !shouldSend {
		d.logger.Printf("Heartbeat already pending for Witness %s, skipping", session)
		return
	}
	// Send heartbeat message, replacing any pending input
	msg := "HEARTBEAT: check your workers"
	if err := d.tmux.SendKeysReplace(session, msg, 50); err != nil {
		d.logger.Printf("Error poking Witness %s: %v", session, err)
		return
	}
	// Record the send for slot deduplication
	_ = d.notifications.RecordSend(session, SlotHeartbeat, msg)
	d.backoff.RecordPoke(agentID)
	// If agent is stale or very stale, record a miss (increase backoff)
	if state == nil || state.IsVeryStale() {
		d.backoff.RecordMiss(agentID)
		interval := d.backoff.GetInterval(agentID)
		d.logger.Printf("Poked Witness %s (very stale, backoff now: %v)", session, interval)
	} else if state.IsStale() {
		d.logger.Printf("Poked Witness %s (stale)", session)
	} else {
		d.logger.Printf("Poked Witness %s", session)
	}
}
// extractRigName extracts the rig name from a witness session name,
// e.g. "gt-gastown-witness" -> "gastown". Each affix is stripped only
// if present, so malformed names pass through mostly unchanged.
func extractRigName(session string) string {
	name := session
	if strings.HasPrefix(name, "gt-") {
		name = name[len("gt-"):]
	}
	if strings.HasSuffix(name, "-witness") {
		name = name[:len(name)-len("-witness")]
	}
	return name
}
// isWitnessSession reports whether a tmux session name matches the
// witness naming pattern "gt-<rig>-witness".
func isWitnessSession(name string) bool {
	// Shortest legal name is "gt-x-witness" (12 chars); the length
	// guard also keeps the prefix and suffix from overlapping.
	if len(name) < 12 {
		return false
	}
	return strings.HasPrefix(name, "gt-") && strings.HasSuffix(name, "-witness")
}
// NOTE: pokeMayor, pokeWitnesses, and pokeWitness have been removed.
// The Deacon molecule is responsible for monitoring Mayor and Witnesses.
// The daemon only ensures Deacon is running and sends it heartbeats.
// processLifecycleRequests checks for and processes lifecycle requests.
func (d *Daemon) processLifecycleRequests() {

View File

@@ -206,32 +206,8 @@ func TestSaveLoadState_Roundtrip(t *testing.T) {
}
}
func TestIsWitnessSession(t *testing.T) {
tests := []struct {
name string
expected bool
}{
{"gt-gastown-witness", true},
{"gt-myrig-witness", true},
{"gt-my-rig-name-witness", true},
{"gt-a-witness", true}, // minimum valid
{"gt-witness", false}, // no rig name
{"gastown-witness", false}, // missing gt- prefix
{"gt-gastown", false}, // missing -witness suffix
{"gt-mayor", false}, // not a witness
{"random-session", false},
{"", false},
{"gt-", false},
{"witness", false},
}
for _, tc := range tests {
result := isWitnessSession(tc.name)
if result != tc.expected {
t.Errorf("isWitnessSession(%q) = %v, expected %v", tc.name, result, tc.expected)
}
}
}
// NOTE: TestIsWitnessSession removed - isWitnessSession function was deleted
// as part of ZFC cleanup (gt-gaxo). Witness poking is now Deacon's responsibility.
func TestLifecycleAction_Constants(t *testing.T) {
// Verify constants have expected string values

View File

@@ -70,46 +70,55 @@ func (d *Daemon) ProcessLifecycleRequests() {
}
}
// parseLifecycleRequest extracts a lifecycle request from a message.
func (d *Daemon) parseLifecycleRequest(msg *BeadsMessage) *LifecycleRequest {
// Look for lifecycle keywords in subject
// Expected format: "LIFECYCLE: <role> requesting <action>"
subject := strings.ToLower(msg.Subject)
// LifecycleBody is the structured body format for lifecycle requests.
// Claude should send mail with JSON body: {"action": "cycle"} or {"action": "shutdown"}
type LifecycleBody struct {
	// Action names the requested transition ("cycle", "restart",
	// "shutdown"; "stop" is accepted as a shutdown alias by the parser).
	Action string `json:"action"`
}
// parseLifecycleRequest extracts a lifecycle request from a message.
// Uses structured body parsing instead of keyword matching on subject.
func (d *Daemon) parseLifecycleRequest(msg *BeadsMessage) *LifecycleRequest {
// Gate: subject must start with "LIFECYCLE:"
subject := strings.ToLower(msg.Subject)
if !strings.HasPrefix(subject, "lifecycle:") {
return nil
}
var action LifecycleAction
var from string
// Parse structured body for action
var body LifecycleBody
if err := json.Unmarshal([]byte(msg.Body), &body); err != nil {
// Fallback: check for simple action strings in body
bodyLower := strings.ToLower(strings.TrimSpace(msg.Body))
switch {
case bodyLower == "restart" || bodyLower == "action: restart":
body.Action = "restart"
case bodyLower == "shutdown" || bodyLower == "action: shutdown" || bodyLower == "stop":
body.Action = "shutdown"
case bodyLower == "cycle" || bodyLower == "action: cycle":
body.Action = "cycle"
default:
d.logger.Printf("Lifecycle request with unparseable body: %q", msg.Body)
return nil
}
}
// Check restart/shutdown before cycle.
// Note: Can't use Contains(subject, "cycle") because "lifecycle:" contains "cycle".
// Use " cycle" (with leading space) to match the word, not the prefix.
if strings.Contains(subject, "restart") {
// Map action string to enum
var action LifecycleAction
switch strings.ToLower(body.Action) {
case "restart":
action = ActionRestart
} else if strings.Contains(subject, "shutdown") || strings.Contains(subject, "stop") {
case "shutdown", "stop":
action = ActionShutdown
} else if strings.Contains(subject, " cycle") || strings.Contains(subject, "cycling") {
case "cycle":
action = ActionCycle
} else {
default:
d.logger.Printf("Unknown lifecycle action: %q", body.Action)
return nil
}
// Extract role from subject: "LIFECYCLE: <role> requesting ..."
// Parse between "lifecycle: " and " requesting"
parts := strings.Split(subject, " requesting")
if len(parts) >= 1 {
rolePart := strings.TrimPrefix(parts[0], "lifecycle:")
from = strings.TrimSpace(rolePart)
}
if from == "" {
from = msg.From // fallback
}
return &LifecycleRequest{
From: from,
From: msg.From,
Action: action,
Timestamp: time.Now(),
}

View File

@@ -1,14 +1,16 @@
package daemon
import (
"io"
"log"
"testing"
)
// testDaemon creates a minimal Daemon for testing.
// We only need the struct to call methods on it: a throwaway town root
// and a logger that discards output so tests stay silent.
func testDaemon() *Daemon {
	return &Daemon{
		config: &Config{TownRoot: "/tmp/test"},
		logger: log.New(io.Discard, "", 0), // silent logger for tests
	}
}
@@ -16,59 +18,62 @@ func TestParseLifecycleRequest_Cycle(t *testing.T) {
d := testDaemon()
tests := []struct {
title string
subject string
body string
expected LifecycleAction
}{
// Explicit cycle requests
{"LIFECYCLE: mayor requesting cycle", ActionCycle},
{"lifecycle: gastown-witness requesting cycling", ActionCycle},
{"LIFECYCLE: witness requesting cycle now", ActionCycle},
// JSON body format
{"LIFECYCLE: requesting action", `{"action": "cycle"}`, ActionCycle},
// Simple text body format
{"LIFECYCLE: requesting action", "cycle", ActionCycle},
{"lifecycle: action request", "action: cycle", ActionCycle},
}
for _, tc := range tests {
msg := &BeadsMessage{
Subject: tc.title,
Subject: tc.subject,
Body: tc.body,
From: "test-sender",
}
result := d.parseLifecycleRequest(msg)
if result == nil {
t.Errorf("parseLifecycleRequest(%q) returned nil, expected action %s", tc.title, tc.expected)
t.Errorf("parseLifecycleRequest(subject=%q, body=%q) returned nil, expected action %s", tc.subject, tc.body, tc.expected)
continue
}
if result.Action != tc.expected {
t.Errorf("parseLifecycleRequest(%q) action = %s, expected %s", tc.title, result.Action, tc.expected)
t.Errorf("parseLifecycleRequest(subject=%q, body=%q) action = %s, expected %s", tc.subject, tc.body, result.Action, tc.expected)
}
}
}
func TestParseLifecycleRequest_RestartAndShutdown(t *testing.T) {
// Verify that restart and shutdown are correctly parsed.
// Previously, the "lifecycle:" prefix contained "cycle", which caused
// all messages to match as cycle. Fixed by checking restart/shutdown
// before cycle, and using " cycle" (with space) to avoid prefix match.
// Verify that restart and shutdown are correctly parsed using structured body.
d := testDaemon()
tests := []struct {
title string
subject string
body string
expected LifecycleAction
}{
{"LIFECYCLE: mayor requesting restart", ActionRestart},
{"LIFECYCLE: mayor requesting shutdown", ActionShutdown},
{"lifecycle: witness requesting stop", ActionShutdown},
{"LIFECYCLE: action", `{"action": "restart"}`, ActionRestart},
{"LIFECYCLE: action", `{"action": "shutdown"}`, ActionShutdown},
{"lifecycle: action", "stop", ActionShutdown},
{"LIFECYCLE: action", "restart", ActionRestart},
}
for _, tc := range tests {
msg := &BeadsMessage{
Subject: tc.title,
Subject: tc.subject,
Body: tc.body,
From: "test-sender",
}
result := d.parseLifecycleRequest(msg)
if result == nil {
t.Errorf("parseLifecycleRequest(%q) returned nil", tc.title)
t.Errorf("parseLifecycleRequest(subject=%q, body=%q) returned nil", tc.subject, tc.body)
continue
}
if result.Action != tc.expected {
t.Errorf("parseLifecycleRequest(%q) action = %s, expected %s", tc.title, result.Action, tc.expected)
t.Errorf("parseLifecycleRequest(subject=%q, body=%q) action = %s, expected %s", tc.subject, tc.body, result.Action, tc.expected)
}
}
}
@@ -96,52 +101,53 @@ func TestParseLifecycleRequest_NotLifecycle(t *testing.T) {
}
}
func TestParseLifecycleRequest_ExtractsFrom(t *testing.T) {
func TestParseLifecycleRequest_UsesFromField(t *testing.T) {
d := testDaemon()
// Now that we use structured body, the From field comes directly from the message
tests := []struct {
title string
subject string
body string
sender string
expectedFrom string
}{
{"LIFECYCLE: mayor requesting cycle", "fallback", "mayor"},
{"LIFECYCLE: gastown-witness requesting restart", "fallback", "gastown-witness"},
{"lifecycle: my-rig-witness requesting shutdown", "fallback", "my-rig-witness"},
{"LIFECYCLE: action", `{"action": "cycle"}`, "mayor", "mayor"},
{"LIFECYCLE: action", "restart", "gastown-witness", "gastown-witness"},
{"lifecycle: action", "shutdown", "my-rig-refinery", "my-rig-refinery"},
}
for _, tc := range tests {
msg := &BeadsMessage{
Subject: tc.title,
Subject: tc.subject,
Body: tc.body,
From: tc.sender,
}
result := d.parseLifecycleRequest(msg)
if result == nil {
t.Errorf("parseLifecycleRequest(%q) returned nil", tc.title)
t.Errorf("parseLifecycleRequest(body=%q) returned nil", tc.body)
continue
}
if result.From != tc.expectedFrom {
t.Errorf("parseLifecycleRequest(%q) from = %q, expected %q", tc.title, result.From, tc.expectedFrom)
t.Errorf("parseLifecycleRequest() from = %q, expected %q", result.From, tc.expectedFrom)
}
}
}
func TestParseLifecycleRequest_FallsBackToSender(t *testing.T) {
func TestParseLifecycleRequest_AlwaysUsesFromField(t *testing.T) {
d := testDaemon()
// When the title doesn't contain a parseable "from", use sender
// With structured body parsing, From always comes from message From field
msg := &BeadsMessage{
Subject: "LIFECYCLE: requesting cycle", // no role before "requesting"
From: "fallback-sender",
Subject: "LIFECYCLE: action",
Body: "cycle",
From: "the-sender",
}
result := d.parseLifecycleRequest(msg)
if result == nil {
t.Fatal("expected non-nil result")
}
// The "from" should be empty string from title parsing, then fallback to sender
if result.From != "fallback-sender" && result.From != "" {
// Note: the actual behavior may just be empty string if parsing gives nothing
// Let's check what actually happens
t.Logf("parseLifecycleRequest fallback: from=%q", result.From)
if result.From != "the-sender" {
t.Errorf("parseLifecycleRequest() from = %q, expected 'the-sender'", result.From)
}
}