diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go
index 2880531f..a498a780 100755
--- a/internal/daemon/daemon.go
+++ b/internal/daemon/daemon.go
@@ -37,17 +37,24 @@ import (
 // This is recovery-focused: normal wake is handled by feed subscription (bd activity --follow).
 // The daemon is the safety net for dead sessions, GUPP violations, and orphaned work.
 type Daemon struct {
-	config  *Config
-	tmux    *tmux.Tmux
-	logger  *log.Logger
-	ctx     context.Context
-	cancel  context.CancelFunc
-	curator *feed.Curator
+	config       *Config
+	patrolConfig *DaemonPatrolConfig
+	tmux         *tmux.Tmux
+	logger       *log.Logger
+	ctx          context.Context
+	cancel       context.CancelFunc
+	curator      *feed.Curator
 	convoyWatcher *ConvoyWatcher
 
 	// Mass death detection: track recent session deaths
 	deathsMu     sync.Mutex
 	recentDeaths []sessionDeath
+
+	// Deacon startup tracking: prevents race condition where newly started
+	// sessions are immediately killed by the heartbeat check.
+	// See: https://github.com/steveyegge/gastown/issues/567
+	// Note: Only accessed from heartbeat loop goroutine - no sync needed.
+	deaconLastStarted time.Time
 }
 
 // sessionDeath records a detected session death for mass death analysis.
@@ -79,12 +86,19 @@ func New(config *Config) (*Daemon, error) {
 	logger := log.New(logFile, "", log.LstdFlags)
 	ctx, cancel := context.WithCancel(context.Background())
 
+	// Load patrol config from mayor/daemon.json (optional - nil if missing)
+	patrolConfig := LoadPatrolConfig(config.TownRoot)
+	if patrolConfig != nil {
+		logger.Printf("Loaded patrol config from %s", PatrolConfigFile(config.TownRoot))
+	}
+
 	return &Daemon{
-		config:  config,
-		tmux:    tmux.NewTmux(),
-		logger:  logger,
-		ctx:     ctx,
-		cancel:  cancel,
+		config:       config,
+		patrolConfig: patrolConfig,
+		tmux:         tmux.NewTmux(),
+		logger:       logger,
+		ctx:          ctx,
+		cancel:       cancel,
 	}, nil
 }
 
@@ -197,21 +211,42 @@ func (d *Daemon) heartbeat(state *State) {
 	d.logger.Println("Heartbeat starting (recovery-focused)")
 
 	// 1. Ensure Deacon is running (restart if dead)
-	d.ensureDeaconRunning()
+	// Check patrol config - can be disabled in mayor/daemon.json
+	if IsPatrolEnabled(d.patrolConfig, "deacon") {
+		d.ensureDeaconRunning()
+	} else {
+		d.logger.Printf("Deacon patrol disabled in config, skipping")
+	}
 
 	// 2. Poke Boot for intelligent triage (stuck/nudge/interrupt)
 	// Boot handles nuanced "is Deacon responsive" decisions
-	d.ensureBootRunning()
+	// Only run if Deacon patrol is enabled
+	if IsPatrolEnabled(d.patrolConfig, "deacon") {
+		d.ensureBootRunning()
+	}
 
 	// 3. Direct Deacon heartbeat check (belt-and-suspenders)
 	// Boot may not detect all stuck states; this provides a fallback
-	d.checkDeaconHeartbeat()
+	// Only run if Deacon patrol is enabled
+	if IsPatrolEnabled(d.patrolConfig, "deacon") {
+		d.checkDeaconHeartbeat()
+	}
 
 	// 4. Ensure Witnesses are running for all rigs (restart if dead)
-	d.ensureWitnessesRunning()
+	// Check patrol config - can be disabled in mayor/daemon.json
+	if IsPatrolEnabled(d.patrolConfig, "witness") {
+		d.ensureWitnessesRunning()
+	} else {
+		d.logger.Printf("Witness patrol disabled in config, skipping")
+	}
 
 	// 5. Ensure Refineries are running for all rigs (restart if dead)
-	d.ensureRefineriesRunning()
+	// Check patrol config - can be disabled in mayor/daemon.json
+	if IsPatrolEnabled(d.patrolConfig, "refinery") {
+		d.ensureRefineriesRunning()
+	} else {
+		d.logger.Printf("Refinery patrol disabled in config, skipping")
+	}
 
 	// 6. Trigger pending polecat spawns (bootstrap mode - ZFC violation acceptable)
 	// This ensures polecats get nudged even when Deacon isn't in a patrol cycle.
@@ -331,13 +366,31 @@ func (d *Daemon) ensureDeaconRunning() {
 		return
 	}
 
+	// Track when we started the Deacon to prevent race condition in checkDeaconHeartbeat.
+	// The heartbeat file will still be stale until the Deacon runs a full patrol cycle.
+	d.deaconLastStarted = time.Now()
 	d.logger.Println("Deacon started successfully")
 }
 
+// deaconGracePeriod is the time to wait after starting a Deacon before checking heartbeat.
+// The Deacon needs time to initialize Claude, run SessionStart hooks, execute gt prime,
+// run a patrol cycle, and write a fresh heartbeat. 5 minutes is conservative.
+const deaconGracePeriod = 5 * time.Minute
+
 // checkDeaconHeartbeat checks if the Deacon is making progress.
 // This is a belt-and-suspenders fallback in case Boot doesn't detect stuck states.
 // Uses the heartbeat file that the Deacon updates on each patrol cycle.
 func (d *Daemon) checkDeaconHeartbeat() {
+	// Grace period: don't check heartbeat for newly started sessions.
+	// This prevents the race condition where we start a Deacon, then immediately
+	// see a stale heartbeat (from before the crash) and kill the session we just started.
+	// See: https://github.com/steveyegge/gastown/issues/567
+	if !d.deaconLastStarted.IsZero() && time.Since(d.deaconLastStarted) < deaconGracePeriod {
+		d.logger.Printf("Deacon started recently (%s ago), skipping heartbeat check",
+			time.Since(d.deaconLastStarted).Round(time.Second))
+		return
+	}
+
 	hb := deacon.ReadHeartbeat(d.config.TownRoot)
 	if hb == nil {
 		// No heartbeat file - Deacon hasn't started a cycle yet
@@ -415,7 +468,8 @@ func (d *Daemon) ensureWitnessRunning(rigName string) {
 
 	if err := mgr.Start(false, "", nil); err != nil {
 		if err == witness.ErrAlreadyRunning {
-			// Already running - nothing to do
+			// Already running - this is the expected case
+			d.logger.Printf("Witness for %s already running, skipping spawn", rigName)
 			return
 		}
 		d.logger.Printf("Error starting witness for %s: %v", rigName, err)
@@ -454,7 +508,8 @@ func (d *Daemon) ensureRefineryRunning(rigName string) {
 
 	if err := mgr.Start(false, ""); err != nil {
 		if err == refinery.ErrAlreadyRunning {
-			// Already running - nothing to do
+			// Already running - this is the expected case when fix is working
+			d.logger.Printf("Refinery for %s already running, skipping spawn", rigName)
 			return
 		}
 		d.logger.Printf("Error starting refinery for %s: %v", rigName, err)
diff --git a/internal/daemon/patrol_config_test.go b/internal/daemon/patrol_config_test.go
new file mode 100644
index 00000000..f8d4f5fb
--- /dev/null
+++ b/internal/daemon/patrol_config_test.go
@@ -0,0 +1,53 @@
+package daemon
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+func TestLoadPatrolConfig(t *testing.T) {
+	// Create a temp dir with test config
+	tmpDir := t.TempDir()
+	mayorDir := filepath.Join(tmpDir, "mayor")
+	if err := os.MkdirAll(mayorDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+
+	// Write test config
+	configJSON := `{
+		"type": "daemon-patrol-config",
+		"version": 1,
+		"patrols": {
+			"refinery": {"enabled": false},
+			"witness": {"enabled": true}
+		}
+	}`
+	if err := os.WriteFile(filepath.Join(mayorDir, "daemon.json"), []byte(configJSON), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	// Load config
+	config := LoadPatrolConfig(tmpDir)
+	if config == nil {
+		t.Fatal("expected config to be loaded")
+	}
+
+	// Test enabled flags
+	if IsPatrolEnabled(config, "refinery") {
+		t.Error("expected refinery to be disabled")
+	}
+	if !IsPatrolEnabled(config, "witness") {
+		t.Error("expected witness to be enabled")
+	}
+	if !IsPatrolEnabled(config, "deacon") {
+		t.Error("expected deacon to be enabled (default)")
+	}
+}
+
+func TestIsPatrolEnabled_NilConfig(t *testing.T) {
+	// Should default to enabled when config is nil
+	if !IsPatrolEnabled(nil, "refinery") {
+		t.Error("expected default to be enabled")
+	}
+}
diff --git a/internal/daemon/types.go b/internal/daemon/types.go
index 62ab8c5d..df1b1564 100644
--- a/internal/daemon/types.go
+++ b/internal/daemon/types.go
@@ -96,6 +96,78 @@ func SaveState(townRoot string, state *State) error {
 	return util.AtomicWriteJSON(stateFile, state)
 }
 
+// PatrolConfig holds configuration for a single patrol.
+type PatrolConfig struct {
+	// Enabled controls whether this patrol runs during heartbeat.
+	Enabled bool `json:"enabled"`
+
+	// Interval is how often to run this patrol (not used yet).
+	Interval string `json:"interval,omitempty"`
+
+	// Agent is the agent type for this patrol (not used yet).
+	Agent string `json:"agent,omitempty"`
+}
+
+// PatrolsConfig holds configuration for all patrols.
+type PatrolsConfig struct {
+	Refinery *PatrolConfig `json:"refinery,omitempty"`
+	Witness  *PatrolConfig `json:"witness,omitempty"`
+	Deacon   *PatrolConfig `json:"deacon,omitempty"`
+}
+
+// DaemonPatrolConfig is the structure of mayor/daemon.json.
+type DaemonPatrolConfig struct {
+	Type      string         `json:"type"`
+	Version   int            `json:"version"`
+	Heartbeat *PatrolConfig  `json:"heartbeat,omitempty"`
+	Patrols   *PatrolsConfig `json:"patrols,omitempty"`
+}
+
+// PatrolConfigFile returns the path to the patrol config file.
+func PatrolConfigFile(townRoot string) string {
+	return filepath.Join(townRoot, "mayor", "daemon.json")
+}
+
+// LoadPatrolConfig loads patrol configuration from mayor/daemon.json.
+// Returns nil if the file doesn't exist or can't be parsed.
+func LoadPatrolConfig(townRoot string) *DaemonPatrolConfig {
+	configFile := PatrolConfigFile(townRoot)
+	data, err := os.ReadFile(configFile)
+	if err != nil {
+		return nil
+	}
+
+	var config DaemonPatrolConfig
+	if err := json.Unmarshal(data, &config); err != nil {
+		return nil
+	}
+	return &config
+}
+
+// IsPatrolEnabled checks if a patrol is enabled in the config.
+// Returns true if the config doesn't exist (default enabled for backwards compatibility).
+func IsPatrolEnabled(config *DaemonPatrolConfig, patrol string) bool {
+	if config == nil || config.Patrols == nil {
+		return true // Default: enabled
+	}
+
+	switch patrol {
+	case "refinery":
+		if config.Patrols.Refinery != nil {
+			return config.Patrols.Refinery.Enabled
+		}
+	case "witness":
+		if config.Patrols.Witness != nil {
+			return config.Patrols.Witness.Enabled
+		}
+	case "deacon":
+		if config.Patrols.Deacon != nil {
+			return config.Patrols.Deacon.Enabled
+		}
+	}
+	return true // Default: enabled
+}
+
 // LifecycleAction represents a lifecycle request action.
 type LifecycleAction string
 
diff --git a/internal/refinery/manager.go b/internal/refinery/manager.go
index e8868dc2..41d539e8 100644
--- a/internal/refinery/manager.go
+++ b/internal/refinery/manager.go
@@ -115,9 +115,8 @@ func (m *Manager) Start(foreground bool, agentOverride string) error {
 
 	if foreground {
 		// In foreground mode, check tmux session (no PID inference per ZFC)
-		townRoot := filepath.Dir(m.rig.Path)
-		agentCfg := config.ResolveRoleAgentConfig(constants.RoleRefinery, townRoot, m.rig.Path)
-		if running, _ := t.HasSession(sessionID); running && t.IsAgentRunning(sessionID, config.ExpectedPaneCommands(agentCfg)...) {
+		// Use IsClaudeRunning for robust detection (see gastown#566)
+		if running, _ := t.HasSession(sessionID); running && t.IsClaudeRunning(sessionID) {
 			return ErrAlreadyRunning
 		}
 
@@ -138,14 +137,15 @@ func (m *Manager) Start(foreground bool, agentOverride string) error {
 	// Background mode: check if session already exists
 	running, _ := t.HasSession(sessionID)
 	if running {
-		// Session exists - check if agent is actually running (healthy vs zombie)
-		townRoot := filepath.Dir(m.rig.Path)
-		agentCfg := config.ResolveRoleAgentConfig(constants.RoleRefinery, townRoot, m.rig.Path)
-		if t.IsAgentRunning(sessionID, config.ExpectedPaneCommands(agentCfg)...) {
-			// Healthy - agent is running
+		// Session exists - check if Claude is actually running (healthy vs zombie)
+		// Use IsClaudeRunning for robust detection: Claude can report as "node", "claude",
+		// or version number like "2.0.76". IsAgentRunning with just "node" was too strict
+		// and caused healthy sessions to be killed. See: gastown#566
+		if t.IsClaudeRunning(sessionID) {
+			// Healthy - Claude is running
 			return ErrAlreadyRunning
 		}
-		// Zombie - tmux alive but agent dead. Kill and recreate.
+		// Zombie - tmux alive but Claude dead. Kill and recreate.
 		_, _ = fmt.Fprintln(m.output, "⚠ Detected zombie session (tmux alive, agent dead). Recreating...")
 		if err := t.KillSession(sessionID); err != nil {
 			return fmt.Errorf("killing zombie session: %w", err)