fix: Add grace period to prevent Deacon restart loop (#590)
* fix(daemon): prevent runaway refinery session spawning

  Fixes #566

  The daemon spawned 812 refinery sessions over 4 days because:

  1. Zombie detection was too strict - used IsAgentRunning(session, "node") but Claude reports the pane command as a version number (e.g., "2.1.7"), causing healthy sessions to be killed and recreated every heartbeat.
  2. daemon.json patrol config was completely ignored - the daemon never loaded or checked the enabled flags.

  Changes:

  - refinery/manager.go: Use IsClaudeRunning() instead of IsAgentRunning() for robust Claude detection (handles "node", "claude", version patterns)
  - daemon/types.go: Add PatrolConfig types and LoadPatrolConfig() to read mayor/daemon.json
  - daemon/daemon.go: Load patrol config at startup, check enabled flags before calling ensureRefineriesRunning/ensureWitnessesRunning, add diagnostic logging for "already running" cases

  Tested: Verified over multiple heartbeats that refinery shows "already running, skipping spawn" instead of spawning new sessions.

* fix: Add grace period to prevent Deacon restart loop

  The daemon had a race condition where:

  1. ensureDeaconRunning() starts a new Deacon session
  2. checkDeaconHeartbeat() runs in the same heartbeat cycle
  3. Heartbeat file is stale (from before crash)
  4. Session is immediately killed
  5. Infinite restart loop every 3 minutes

  Fix:

  - Track when Deacon was last started (deaconLastStarted field)
  - Skip heartbeat check during 5-minute grace period
  - Add config support for Deacon (consistency with refinery/witness)

  After grace period, normal heartbeat checking resumes. Genuinely stuck sessions (no heartbeat update after 5+ min) are still detected.

  Fixes #589

---------

Co-authored-by: mayor <your-github-email@example.com>
This commit is contained in:
@@ -37,17 +37,24 @@ import (
|
||||
// This is recovery-focused: normal wake is handled by feed subscription (bd activity --follow).
|
||||
// The daemon is the safety net for dead sessions, GUPP violations, and orphaned work.
|
||||
type Daemon struct {
|
||||
config *Config
|
||||
tmux *tmux.Tmux
|
||||
logger *log.Logger
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
curator *feed.Curator
|
||||
config *Config
|
||||
patrolConfig *DaemonPatrolConfig
|
||||
tmux *tmux.Tmux
|
||||
logger *log.Logger
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
curator *feed.Curator
|
||||
convoyWatcher *ConvoyWatcher
|
||||
|
||||
// Mass death detection: track recent session deaths
|
||||
deathsMu sync.Mutex
|
||||
recentDeaths []sessionDeath
|
||||
|
||||
// Deacon startup tracking: prevents race condition where newly started
|
||||
// sessions are immediately killed by the heartbeat check.
|
||||
// See: https://github.com/steveyegge/gastown/issues/567
|
||||
// Note: Only accessed from heartbeat loop goroutine - no sync needed.
|
||||
deaconLastStarted time.Time
|
||||
}
|
||||
|
||||
// sessionDeath records a detected session death for mass death analysis.
|
||||
@@ -79,12 +86,19 @@ func New(config *Config) (*Daemon, error) {
|
||||
logger := log.New(logFile, "", log.LstdFlags)
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
// Load patrol config from mayor/daemon.json (optional - nil if missing)
|
||||
patrolConfig := LoadPatrolConfig(config.TownRoot)
|
||||
if patrolConfig != nil {
|
||||
logger.Printf("Loaded patrol config from %s", PatrolConfigFile(config.TownRoot))
|
||||
}
|
||||
|
||||
return &Daemon{
|
||||
config: config,
|
||||
tmux: tmux.NewTmux(),
|
||||
logger: logger,
|
||||
ctx: ctx,
|
||||
cancel: cancel,
|
||||
config: config,
|
||||
patrolConfig: patrolConfig,
|
||||
tmux: tmux.NewTmux(),
|
||||
logger: logger,
|
||||
ctx: ctx,
|
||||
cancel: cancel,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -197,21 +211,42 @@ func (d *Daemon) heartbeat(state *State) {
|
||||
d.logger.Println("Heartbeat starting (recovery-focused)")
|
||||
|
||||
// 1. Ensure Deacon is running (restart if dead)
|
||||
d.ensureDeaconRunning()
|
||||
// Check patrol config - can be disabled in mayor/daemon.json
|
||||
if IsPatrolEnabled(d.patrolConfig, "deacon") {
|
||||
d.ensureDeaconRunning()
|
||||
} else {
|
||||
d.logger.Printf("Deacon patrol disabled in config, skipping")
|
||||
}
|
||||
|
||||
// 2. Poke Boot for intelligent triage (stuck/nudge/interrupt)
|
||||
// Boot handles nuanced "is Deacon responsive" decisions
|
||||
d.ensureBootRunning()
|
||||
// Only run if Deacon patrol is enabled
|
||||
if IsPatrolEnabled(d.patrolConfig, "deacon") {
|
||||
d.ensureBootRunning()
|
||||
}
|
||||
|
||||
// 3. Direct Deacon heartbeat check (belt-and-suspenders)
|
||||
// Boot may not detect all stuck states; this provides a fallback
|
||||
d.checkDeaconHeartbeat()
|
||||
// Only run if Deacon patrol is enabled
|
||||
if IsPatrolEnabled(d.patrolConfig, "deacon") {
|
||||
d.checkDeaconHeartbeat()
|
||||
}
|
||||
|
||||
// 4. Ensure Witnesses are running for all rigs (restart if dead)
|
||||
d.ensureWitnessesRunning()
|
||||
// Check patrol config - can be disabled in mayor/daemon.json
|
||||
if IsPatrolEnabled(d.patrolConfig, "witness") {
|
||||
d.ensureWitnessesRunning()
|
||||
} else {
|
||||
d.logger.Printf("Witness patrol disabled in config, skipping")
|
||||
}
|
||||
|
||||
// 5. Ensure Refineries are running for all rigs (restart if dead)
|
||||
d.ensureRefineriesRunning()
|
||||
// Check patrol config - can be disabled in mayor/daemon.json
|
||||
if IsPatrolEnabled(d.patrolConfig, "refinery") {
|
||||
d.ensureRefineriesRunning()
|
||||
} else {
|
||||
d.logger.Printf("Refinery patrol disabled in config, skipping")
|
||||
}
|
||||
|
||||
// 6. Trigger pending polecat spawns (bootstrap mode - ZFC violation acceptable)
|
||||
// This ensures polecats get nudged even when Deacon isn't in a patrol cycle.
|
||||
@@ -331,13 +366,31 @@ func (d *Daemon) ensureDeaconRunning() {
|
||||
return
|
||||
}
|
||||
|
||||
// Track when we started the Deacon to prevent race condition in checkDeaconHeartbeat.
|
||||
// The heartbeat file will still be stale until the Deacon runs a full patrol cycle.
|
||||
d.deaconLastStarted = time.Now()
|
||||
d.logger.Println("Deacon started successfully")
|
||||
}
|
||||
|
||||
// deaconGracePeriod is the time to wait after starting a Deacon before checking heartbeat.
|
||||
// The Deacon needs time to initialize Claude, run SessionStart hooks, execute gt prime,
|
||||
// run a patrol cycle, and write a fresh heartbeat. 5 minutes is conservative.
|
||||
const deaconGracePeriod = 5 * time.Minute
|
||||
|
||||
// checkDeaconHeartbeat checks if the Deacon is making progress.
|
||||
// This is a belt-and-suspenders fallback in case Boot doesn't detect stuck states.
|
||||
// Uses the heartbeat file that the Deacon updates on each patrol cycle.
|
||||
func (d *Daemon) checkDeaconHeartbeat() {
|
||||
// Grace period: don't check heartbeat for newly started sessions.
|
||||
// This prevents the race condition where we start a Deacon, then immediately
|
||||
// see a stale heartbeat (from before the crash) and kill the session we just started.
|
||||
// See: https://github.com/steveyegge/gastown/issues/567
|
||||
if !d.deaconLastStarted.IsZero() && time.Since(d.deaconLastStarted) < deaconGracePeriod {
|
||||
d.logger.Printf("Deacon started recently (%s ago), skipping heartbeat check",
|
||||
time.Since(d.deaconLastStarted).Round(time.Second))
|
||||
return
|
||||
}
|
||||
|
||||
hb := deacon.ReadHeartbeat(d.config.TownRoot)
|
||||
if hb == nil {
|
||||
// No heartbeat file - Deacon hasn't started a cycle yet
|
||||
@@ -415,7 +468,8 @@ func (d *Daemon) ensureWitnessRunning(rigName string) {
|
||||
|
||||
if err := mgr.Start(false, "", nil); err != nil {
|
||||
if err == witness.ErrAlreadyRunning {
|
||||
// Already running - nothing to do
|
||||
// Already running - this is the expected case
|
||||
d.logger.Printf("Witness for %s already running, skipping spawn", rigName)
|
||||
return
|
||||
}
|
||||
d.logger.Printf("Error starting witness for %s: %v", rigName, err)
|
||||
@@ -454,7 +508,8 @@ func (d *Daemon) ensureRefineryRunning(rigName string) {
|
||||
|
||||
if err := mgr.Start(false, ""); err != nil {
|
||||
if err == refinery.ErrAlreadyRunning {
|
||||
// Already running - nothing to do
|
||||
// Already running - this is the expected case when fix is working
|
||||
d.logger.Printf("Refinery for %s already running, skipping spawn", rigName)
|
||||
return
|
||||
}
|
||||
d.logger.Printf("Error starting refinery for %s: %v", rigName, err)
|
||||
|
||||
Reference in New Issue
Block a user