fix: Add grace period to prevent Deacon restart loop (#590)
* fix(daemon): prevent runaway refinery session spawning Fixes #566 The daemon spawned 812 refinery sessions over 4 days because: 1. Zombie detection was too strict - used IsAgentRunning(session, "node") but Claude reports pane command as version number (e.g., "2.1.7"), causing healthy sessions to be killed and recreated every heartbeat. 2. daemon.json patrol config was completely ignored - the daemon never loaded or checked the enabled flags. Changes: - refinery/manager.go: Use IsClaudeRunning() instead of IsAgentRunning() for robust Claude detection (handles "node", "claude", version patterns) - daemon/types.go: Add PatrolConfig types and LoadPatrolConfig() to read mayor/daemon.json - daemon/daemon.go: Load patrol config at startup, check enabled flags before calling ensureRefineriesRunning/ensureWitnessesRunning, add diagnostic logging for "already running" cases Tested: Verified over multiple heartbeats that refinery shows "already running, skipping spawn" instead of spawning new sessions. * fix: Add grace period to prevent Deacon restart loop The daemon had a race condition where: 1. ensureDeaconRunning() starts a new Deacon session 2. checkDeaconHeartbeat() runs in same heartbeat cycle 3. Heartbeat file is stale (from before crash) 4. Session is immediately killed 5. Infinite restart loop every 3 minutes Fix: - Track when Deacon was last started (deaconLastStarted field) - Skip heartbeat check during 5-minute grace period - Add config support for Deacon (consistency with refinery/witness) After grace period, normal heartbeat checking resumes. Genuinely stuck sessions (no heartbeat update after 5+ min) are still detected. Fixes #589 --------- Co-authored-by: mayor <your-github-email@example.com>
This commit is contained in:
@@ -37,17 +37,24 @@ import (
|
|||||||
// This is recovery-focused: normal wake is handled by feed subscription (bd activity --follow).
|
// This is recovery-focused: normal wake is handled by feed subscription (bd activity --follow).
|
||||||
// The daemon is the safety net for dead sessions, GUPP violations, and orphaned work.
|
// The daemon is the safety net for dead sessions, GUPP violations, and orphaned work.
|
||||||
type Daemon struct {
|
type Daemon struct {
|
||||||
config *Config
|
config *Config
|
||||||
tmux *tmux.Tmux
|
patrolConfig *DaemonPatrolConfig
|
||||||
logger *log.Logger
|
tmux *tmux.Tmux
|
||||||
ctx context.Context
|
logger *log.Logger
|
||||||
cancel context.CancelFunc
|
ctx context.Context
|
||||||
curator *feed.Curator
|
cancel context.CancelFunc
|
||||||
|
curator *feed.Curator
|
||||||
convoyWatcher *ConvoyWatcher
|
convoyWatcher *ConvoyWatcher
|
||||||
|
|
||||||
// Mass death detection: track recent session deaths
|
// Mass death detection: track recent session deaths
|
||||||
deathsMu sync.Mutex
|
deathsMu sync.Mutex
|
||||||
recentDeaths []sessionDeath
|
recentDeaths []sessionDeath
|
||||||
|
|
||||||
|
// Deacon startup tracking: prevents race condition where newly started
|
||||||
|
// sessions are immediately killed by the heartbeat check.
|
||||||
|
// See: https://github.com/steveyegge/gastown/issues/567
|
||||||
|
// Note: Only accessed from heartbeat loop goroutine - no sync needed.
|
||||||
|
deaconLastStarted time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
// sessionDeath records a detected session death for mass death analysis.
|
// sessionDeath records a detected session death for mass death analysis.
|
||||||
@@ -79,12 +86,19 @@ func New(config *Config) (*Daemon, error) {
|
|||||||
logger := log.New(logFile, "", log.LstdFlags)
|
logger := log.New(logFile, "", log.LstdFlags)
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
|
||||||
|
// Load patrol config from mayor/daemon.json (optional - nil if missing)
|
||||||
|
patrolConfig := LoadPatrolConfig(config.TownRoot)
|
||||||
|
if patrolConfig != nil {
|
||||||
|
logger.Printf("Loaded patrol config from %s", PatrolConfigFile(config.TownRoot))
|
||||||
|
}
|
||||||
|
|
||||||
return &Daemon{
|
return &Daemon{
|
||||||
config: config,
|
config: config,
|
||||||
tmux: tmux.NewTmux(),
|
patrolConfig: patrolConfig,
|
||||||
logger: logger,
|
tmux: tmux.NewTmux(),
|
||||||
ctx: ctx,
|
logger: logger,
|
||||||
cancel: cancel,
|
ctx: ctx,
|
||||||
|
cancel: cancel,
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -197,21 +211,42 @@ func (d *Daemon) heartbeat(state *State) {
|
|||||||
d.logger.Println("Heartbeat starting (recovery-focused)")
|
d.logger.Println("Heartbeat starting (recovery-focused)")
|
||||||
|
|
||||||
// 1. Ensure Deacon is running (restart if dead)
|
// 1. Ensure Deacon is running (restart if dead)
|
||||||
d.ensureDeaconRunning()
|
// Check patrol config - can be disabled in mayor/daemon.json
|
||||||
|
if IsPatrolEnabled(d.patrolConfig, "deacon") {
|
||||||
|
d.ensureDeaconRunning()
|
||||||
|
} else {
|
||||||
|
d.logger.Printf("Deacon patrol disabled in config, skipping")
|
||||||
|
}
|
||||||
|
|
||||||
// 2. Poke Boot for intelligent triage (stuck/nudge/interrupt)
|
// 2. Poke Boot for intelligent triage (stuck/nudge/interrupt)
|
||||||
// Boot handles nuanced "is Deacon responsive" decisions
|
// Boot handles nuanced "is Deacon responsive" decisions
|
||||||
d.ensureBootRunning()
|
// Only run if Deacon patrol is enabled
|
||||||
|
if IsPatrolEnabled(d.patrolConfig, "deacon") {
|
||||||
|
d.ensureBootRunning()
|
||||||
|
}
|
||||||
|
|
||||||
// 3. Direct Deacon heartbeat check (belt-and-suspenders)
|
// 3. Direct Deacon heartbeat check (belt-and-suspenders)
|
||||||
// Boot may not detect all stuck states; this provides a fallback
|
// Boot may not detect all stuck states; this provides a fallback
|
||||||
d.checkDeaconHeartbeat()
|
// Only run if Deacon patrol is enabled
|
||||||
|
if IsPatrolEnabled(d.patrolConfig, "deacon") {
|
||||||
|
d.checkDeaconHeartbeat()
|
||||||
|
}
|
||||||
|
|
||||||
// 4. Ensure Witnesses are running for all rigs (restart if dead)
|
// 4. Ensure Witnesses are running for all rigs (restart if dead)
|
||||||
d.ensureWitnessesRunning()
|
// Check patrol config - can be disabled in mayor/daemon.json
|
||||||
|
if IsPatrolEnabled(d.patrolConfig, "witness") {
|
||||||
|
d.ensureWitnessesRunning()
|
||||||
|
} else {
|
||||||
|
d.logger.Printf("Witness patrol disabled in config, skipping")
|
||||||
|
}
|
||||||
|
|
||||||
// 5. Ensure Refineries are running for all rigs (restart if dead)
|
// 5. Ensure Refineries are running for all rigs (restart if dead)
|
||||||
d.ensureRefineriesRunning()
|
// Check patrol config - can be disabled in mayor/daemon.json
|
||||||
|
if IsPatrolEnabled(d.patrolConfig, "refinery") {
|
||||||
|
d.ensureRefineriesRunning()
|
||||||
|
} else {
|
||||||
|
d.logger.Printf("Refinery patrol disabled in config, skipping")
|
||||||
|
}
|
||||||
|
|
||||||
// 6. Trigger pending polecat spawns (bootstrap mode - ZFC violation acceptable)
|
// 6. Trigger pending polecat spawns (bootstrap mode - ZFC violation acceptable)
|
||||||
// This ensures polecats get nudged even when Deacon isn't in a patrol cycle.
|
// This ensures polecats get nudged even when Deacon isn't in a patrol cycle.
|
||||||
@@ -331,13 +366,31 @@ func (d *Daemon) ensureDeaconRunning() {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Track when we started the Deacon to prevent race condition in checkDeaconHeartbeat.
|
||||||
|
// The heartbeat file will still be stale until the Deacon runs a full patrol cycle.
|
||||||
|
d.deaconLastStarted = time.Now()
|
||||||
d.logger.Println("Deacon started successfully")
|
d.logger.Println("Deacon started successfully")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// deaconGracePeriod is the time to wait after starting a Deacon before checking heartbeat.
|
||||||
|
// The Deacon needs time to initialize Claude, run SessionStart hooks, execute gt prime,
|
||||||
|
// run a patrol cycle, and write a fresh heartbeat. 5 minutes is conservative.
|
||||||
|
const deaconGracePeriod = 5 * time.Minute
|
||||||
|
|
||||||
// checkDeaconHeartbeat checks if the Deacon is making progress.
|
// checkDeaconHeartbeat checks if the Deacon is making progress.
|
||||||
// This is a belt-and-suspenders fallback in case Boot doesn't detect stuck states.
|
// This is a belt-and-suspenders fallback in case Boot doesn't detect stuck states.
|
||||||
// Uses the heartbeat file that the Deacon updates on each patrol cycle.
|
// Uses the heartbeat file that the Deacon updates on each patrol cycle.
|
||||||
func (d *Daemon) checkDeaconHeartbeat() {
|
func (d *Daemon) checkDeaconHeartbeat() {
|
||||||
|
// Grace period: don't check heartbeat for newly started sessions.
|
||||||
|
// This prevents the race condition where we start a Deacon, then immediately
|
||||||
|
// see a stale heartbeat (from before the crash) and kill the session we just started.
|
||||||
|
// See: https://github.com/steveyegge/gastown/issues/567
|
||||||
|
if !d.deaconLastStarted.IsZero() && time.Since(d.deaconLastStarted) < deaconGracePeriod {
|
||||||
|
d.logger.Printf("Deacon started recently (%s ago), skipping heartbeat check",
|
||||||
|
time.Since(d.deaconLastStarted).Round(time.Second))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
hb := deacon.ReadHeartbeat(d.config.TownRoot)
|
hb := deacon.ReadHeartbeat(d.config.TownRoot)
|
||||||
if hb == nil {
|
if hb == nil {
|
||||||
// No heartbeat file - Deacon hasn't started a cycle yet
|
// No heartbeat file - Deacon hasn't started a cycle yet
|
||||||
@@ -415,7 +468,8 @@ func (d *Daemon) ensureWitnessRunning(rigName string) {
|
|||||||
|
|
||||||
if err := mgr.Start(false, "", nil); err != nil {
|
if err := mgr.Start(false, "", nil); err != nil {
|
||||||
if err == witness.ErrAlreadyRunning {
|
if err == witness.ErrAlreadyRunning {
|
||||||
// Already running - nothing to do
|
// Already running - this is the expected case
|
||||||
|
d.logger.Printf("Witness for %s already running, skipping spawn", rigName)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
d.logger.Printf("Error starting witness for %s: %v", rigName, err)
|
d.logger.Printf("Error starting witness for %s: %v", rigName, err)
|
||||||
@@ -454,7 +508,8 @@ func (d *Daemon) ensureRefineryRunning(rigName string) {
|
|||||||
|
|
||||||
if err := mgr.Start(false, ""); err != nil {
|
if err := mgr.Start(false, ""); err != nil {
|
||||||
if err == refinery.ErrAlreadyRunning {
|
if err == refinery.ErrAlreadyRunning {
|
||||||
// Already running - nothing to do
|
// Already running - this is the expected case when fix is working
|
||||||
|
d.logger.Printf("Refinery for %s already running, skipping spawn", rigName)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
d.logger.Printf("Error starting refinery for %s: %v", rigName, err)
|
d.logger.Printf("Error starting refinery for %s: %v", rigName, err)
|
||||||
|
|||||||
53
internal/daemon/patrol_config_test.go
Normal file
53
internal/daemon/patrol_config_test.go
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
package daemon
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestLoadPatrolConfig(t *testing.T) {
|
||||||
|
// Create a temp dir with test config
|
||||||
|
tmpDir := t.TempDir()
|
||||||
|
mayorDir := filepath.Join(tmpDir, "mayor")
|
||||||
|
if err := os.MkdirAll(mayorDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write test config
|
||||||
|
configJSON := `{
|
||||||
|
"type": "daemon-patrol-config",
|
||||||
|
"version": 1,
|
||||||
|
"patrols": {
|
||||||
|
"refinery": {"enabled": false},
|
||||||
|
"witness": {"enabled": true}
|
||||||
|
}
|
||||||
|
}`
|
||||||
|
if err := os.WriteFile(filepath.Join(mayorDir, "daemon.json"), []byte(configJSON), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load config
|
||||||
|
config := LoadPatrolConfig(tmpDir)
|
||||||
|
if config == nil {
|
||||||
|
t.Fatal("expected config to be loaded")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test enabled flags
|
||||||
|
if IsPatrolEnabled(config, "refinery") {
|
||||||
|
t.Error("expected refinery to be disabled")
|
||||||
|
}
|
||||||
|
if !IsPatrolEnabled(config, "witness") {
|
||||||
|
t.Error("expected witness to be enabled")
|
||||||
|
}
|
||||||
|
if !IsPatrolEnabled(config, "deacon") {
|
||||||
|
t.Error("expected deacon to be enabled (default)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestIsPatrolEnabled_NilConfig(t *testing.T) {
|
||||||
|
// Should default to enabled when config is nil
|
||||||
|
if !IsPatrolEnabled(nil, "refinery") {
|
||||||
|
t.Error("expected default to be enabled")
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -96,6 +96,78 @@ func SaveState(townRoot string, state *State) error {
|
|||||||
return util.AtomicWriteJSON(stateFile, state)
|
return util.AtomicWriteJSON(stateFile, state)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// PatrolConfig holds the settings for one daemon patrol.
type PatrolConfig struct {
	// Enabled controls whether this patrol runs during heartbeat.
	Enabled bool `json:"enabled"`

	// Interval is how often to run this patrol (reserved; not used yet).
	Interval string `json:"interval,omitempty"`

	// Agent is the agent type for this patrol (reserved; not used yet).
	Agent string `json:"agent,omitempty"`
}
|
||||||
|
|
||||||
|
// PatrolsConfig holds configuration for all patrols.
|
||||||
|
type PatrolsConfig struct {
|
||||||
|
Refinery *PatrolConfig `json:"refinery,omitempty"`
|
||||||
|
Witness *PatrolConfig `json:"witness,omitempty"`
|
||||||
|
Deacon *PatrolConfig `json:"deacon,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// DaemonPatrolConfig is the structure of mayor/daemon.json.
|
||||||
|
type DaemonPatrolConfig struct {
|
||||||
|
Type string `json:"type"`
|
||||||
|
Version int `json:"version"`
|
||||||
|
Heartbeat *PatrolConfig `json:"heartbeat,omitempty"`
|
||||||
|
Patrols *PatrolsConfig `json:"patrols,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// PatrolConfigFile returns the path to the patrol config file.
|
||||||
|
func PatrolConfigFile(townRoot string) string {
|
||||||
|
return filepath.Join(townRoot, "mayor", "daemon.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadPatrolConfig loads patrol configuration from mayor/daemon.json.
|
||||||
|
// Returns nil if the file doesn't exist or can't be parsed.
|
||||||
|
func LoadPatrolConfig(townRoot string) *DaemonPatrolConfig {
|
||||||
|
configFile := PatrolConfigFile(townRoot)
|
||||||
|
data, err := os.ReadFile(configFile)
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var config DaemonPatrolConfig
|
||||||
|
if err := json.Unmarshal(data, &config); err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return &config
|
||||||
|
}
|
||||||
|
|
||||||
|
// IsPatrolEnabled checks if a patrol is enabled in the config.
|
||||||
|
// Returns true if the config doesn't exist (default enabled for backwards compatibility).
|
||||||
|
func IsPatrolEnabled(config *DaemonPatrolConfig, patrol string) bool {
|
||||||
|
if config == nil || config.Patrols == nil {
|
||||||
|
return true // Default: enabled
|
||||||
|
}
|
||||||
|
|
||||||
|
switch patrol {
|
||||||
|
case "refinery":
|
||||||
|
if config.Patrols.Refinery != nil {
|
||||||
|
return config.Patrols.Refinery.Enabled
|
||||||
|
}
|
||||||
|
case "witness":
|
||||||
|
if config.Patrols.Witness != nil {
|
||||||
|
return config.Patrols.Witness.Enabled
|
||||||
|
}
|
||||||
|
case "deacon":
|
||||||
|
if config.Patrols.Deacon != nil {
|
||||||
|
return config.Patrols.Deacon.Enabled
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true // Default: enabled
|
||||||
|
}
|
||||||
|
|
||||||
// LifecycleAction represents a lifecycle request action.
|
// LifecycleAction represents a lifecycle request action.
|
||||||
type LifecycleAction string
|
type LifecycleAction string
|
||||||
|
|
||||||
|
|||||||
@@ -115,9 +115,8 @@ func (m *Manager) Start(foreground bool, agentOverride string) error {
|
|||||||
|
|
||||||
if foreground {
|
if foreground {
|
||||||
// In foreground mode, check tmux session (no PID inference per ZFC)
|
// In foreground mode, check tmux session (no PID inference per ZFC)
|
||||||
townRoot := filepath.Dir(m.rig.Path)
|
// Use IsClaudeRunning for robust detection (see gastown#566)
|
||||||
agentCfg := config.ResolveRoleAgentConfig(constants.RoleRefinery, townRoot, m.rig.Path)
|
if running, _ := t.HasSession(sessionID); running && t.IsClaudeRunning(sessionID) {
|
||||||
if running, _ := t.HasSession(sessionID); running && t.IsAgentRunning(sessionID, config.ExpectedPaneCommands(agentCfg)...) {
|
|
||||||
return ErrAlreadyRunning
|
return ErrAlreadyRunning
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -138,14 +137,15 @@ func (m *Manager) Start(foreground bool, agentOverride string) error {
|
|||||||
// Background mode: check if session already exists
|
// Background mode: check if session already exists
|
||||||
running, _ := t.HasSession(sessionID)
|
running, _ := t.HasSession(sessionID)
|
||||||
if running {
|
if running {
|
||||||
// Session exists - check if agent is actually running (healthy vs zombie)
|
// Session exists - check if Claude is actually running (healthy vs zombie)
|
||||||
townRoot := filepath.Dir(m.rig.Path)
|
// Use IsClaudeRunning for robust detection: Claude can report as "node", "claude",
|
||||||
agentCfg := config.ResolveRoleAgentConfig(constants.RoleRefinery, townRoot, m.rig.Path)
|
// or version number like "2.0.76". IsAgentRunning with just "node" was too strict
|
||||||
if t.IsAgentRunning(sessionID, config.ExpectedPaneCommands(agentCfg)...) {
|
// and caused healthy sessions to be killed. See: gastown#566
|
||||||
// Healthy - agent is running
|
if t.IsClaudeRunning(sessionID) {
|
||||||
|
// Healthy - Claude is running
|
||||||
return ErrAlreadyRunning
|
return ErrAlreadyRunning
|
||||||
}
|
}
|
||||||
// Zombie - tmux alive but agent dead. Kill and recreate.
|
// Zombie - tmux alive but Claude dead. Kill and recreate.
|
||||||
_, _ = fmt.Fprintln(m.output, "⚠ Detected zombie session (tmux alive, agent dead). Recreating...")
|
_, _ = fmt.Fprintln(m.output, "⚠ Detected zombie session (tmux alive, agent dead). Recreating...")
|
||||||
if err := t.KillSession(sessionID); err != nil {
|
if err := t.KillSession(sessionID); err != nil {
|
||||||
return fmt.Errorf("killing zombie session: %w", err)
|
return fmt.Errorf("killing zombie session: %w", err)
|
||||||
|
|||||||
Reference in New Issue
Block a user