fix: Add grace period to prevent Deacon restart loop (#590)

* fix(daemon): prevent runaway refinery session spawning

Fixes #566

The daemon spawned 812 refinery sessions over 4 days because:

1. Zombie detection was too strict - used IsAgentRunning(session, "node")
   but Claude reports pane command as version number (e.g., "2.1.7"),
   causing healthy sessions to be killed and recreated every heartbeat.

2. daemon.json patrol config was completely ignored - the daemon never
   loaded or checked the enabled flags.

Changes:
- refinery/manager.go: Use IsClaudeRunning() instead of IsAgentRunning()
  for robust Claude detection (handles "node", "claude", version patterns)
- daemon/types.go: Add PatrolConfig types and LoadPatrolConfig() to read
  mayor/daemon.json
- daemon/daemon.go: Load patrol config at startup, check enabled flags
  before calling ensureRefineriesRunning/ensureWitnessesRunning, add
  diagnostic logging for "already running" cases

Tested: Verified over multiple heartbeats that refinery shows "already
running, skipping spawn" instead of spawning new sessions.

* fix: Add grace period to prevent Deacon restart loop

The daemon had a race condition where:
1. ensureDeaconRunning() starts a new Deacon session
2. checkDeaconHeartbeat() runs in same heartbeat cycle
3. Heartbeat file is stale (from before crash)
4. Session is immediately killed
5. Infinite restart loop every 3 minutes

Fix:
- Track when Deacon was last started (deaconLastStarted field)
- Skip heartbeat check during 5-minute grace period
- Add config support for Deacon (consistency with refinery/witness)

After grace period, normal heartbeat checking resumes. Genuinely
stuck sessions (no heartbeat update after 5+ min) are still detected.

Fixes #589

---------

Co-authored-by: mayor <your-github-email@example.com>
This commit is contained in:
Walter McGivney
2026-01-16 18:27:41 -05:00
committed by GitHub
parent 91433e8b1d
commit 29f8dd67e2
4 changed files with 207 additions and 27 deletions

View File

@@ -37,17 +37,24 @@ import (
// This is recovery-focused: normal wake is handled by feed subscription (bd activity --follow).
// The daemon is the safety net for dead sessions, GUPP violations, and orphaned work.
type Daemon struct {
config *Config
tmux *tmux.Tmux
logger *log.Logger
ctx context.Context
cancel context.CancelFunc
curator *feed.Curator
config *Config
patrolConfig *DaemonPatrolConfig
tmux *tmux.Tmux
logger *log.Logger
ctx context.Context
cancel context.CancelFunc
curator *feed.Curator
convoyWatcher *ConvoyWatcher
// Mass death detection: track recent session deaths
deathsMu sync.Mutex
recentDeaths []sessionDeath
// Deacon startup tracking: prevents race condition where newly started
// sessions are immediately killed by the heartbeat check.
// See: https://github.com/steveyegge/gastown/issues/567
// Note: Only accessed from heartbeat loop goroutine - no sync needed.
deaconLastStarted time.Time
}
// sessionDeath records a detected session death for mass death analysis.
@@ -79,12 +86,19 @@ func New(config *Config) (*Daemon, error) {
logger := log.New(logFile, "", log.LstdFlags)
ctx, cancel := context.WithCancel(context.Background())
// Load patrol config from mayor/daemon.json (optional - nil if missing)
patrolConfig := LoadPatrolConfig(config.TownRoot)
if patrolConfig != nil {
logger.Printf("Loaded patrol config from %s", PatrolConfigFile(config.TownRoot))
}
return &Daemon{
config: config,
tmux: tmux.NewTmux(),
logger: logger,
ctx: ctx,
cancel: cancel,
config: config,
patrolConfig: patrolConfig,
tmux: tmux.NewTmux(),
logger: logger,
ctx: ctx,
cancel: cancel,
}, nil
}
@@ -197,21 +211,42 @@ func (d *Daemon) heartbeat(state *State) {
d.logger.Println("Heartbeat starting (recovery-focused)")
// 1. Ensure Deacon is running (restart if dead)
d.ensureDeaconRunning()
// Check patrol config - can be disabled in mayor/daemon.json
if IsPatrolEnabled(d.patrolConfig, "deacon") {
d.ensureDeaconRunning()
} else {
d.logger.Printf("Deacon patrol disabled in config, skipping")
}
// 2. Poke Boot for intelligent triage (stuck/nudge/interrupt)
// Boot handles nuanced "is Deacon responsive" decisions
d.ensureBootRunning()
// Only run if Deacon patrol is enabled
if IsPatrolEnabled(d.patrolConfig, "deacon") {
d.ensureBootRunning()
}
// 3. Direct Deacon heartbeat check (belt-and-suspenders)
// Boot may not detect all stuck states; this provides a fallback
d.checkDeaconHeartbeat()
// Only run if Deacon patrol is enabled
if IsPatrolEnabled(d.patrolConfig, "deacon") {
d.checkDeaconHeartbeat()
}
// 4. Ensure Witnesses are running for all rigs (restart if dead)
d.ensureWitnessesRunning()
// Check patrol config - can be disabled in mayor/daemon.json
if IsPatrolEnabled(d.patrolConfig, "witness") {
d.ensureWitnessesRunning()
} else {
d.logger.Printf("Witness patrol disabled in config, skipping")
}
// 5. Ensure Refineries are running for all rigs (restart if dead)
d.ensureRefineriesRunning()
// Check patrol config - can be disabled in mayor/daemon.json
if IsPatrolEnabled(d.patrolConfig, "refinery") {
d.ensureRefineriesRunning()
} else {
d.logger.Printf("Refinery patrol disabled in config, skipping")
}
// 6. Trigger pending polecat spawns (bootstrap mode - ZFC violation acceptable)
// This ensures polecats get nudged even when Deacon isn't in a patrol cycle.
@@ -331,13 +366,31 @@ func (d *Daemon) ensureDeaconRunning() {
return
}
// Track when we started the Deacon to prevent race condition in checkDeaconHeartbeat.
// The heartbeat file will still be stale until the Deacon runs a full patrol cycle.
d.deaconLastStarted = time.Now()
d.logger.Println("Deacon started successfully")
}
// deaconGracePeriod is the time to wait after starting a Deacon before checking heartbeat.
// The Deacon needs time to initialize Claude, run SessionStart hooks, execute gt prime,
// run a patrol cycle, and write a fresh heartbeat. 5 minutes is conservative.
const deaconGracePeriod = 5 * time.Minute
// checkDeaconHeartbeat checks if the Deacon is making progress.
// This is a belt-and-suspenders fallback in case Boot doesn't detect stuck states.
// Uses the heartbeat file that the Deacon updates on each patrol cycle.
func (d *Daemon) checkDeaconHeartbeat() {
// Grace period: don't check heartbeat for newly started sessions.
// This prevents the race condition where we start a Deacon, then immediately
// see a stale heartbeat (from before the crash) and kill the session we just started.
// See: https://github.com/steveyegge/gastown/issues/567
if !d.deaconLastStarted.IsZero() && time.Since(d.deaconLastStarted) < deaconGracePeriod {
d.logger.Printf("Deacon started recently (%s ago), skipping heartbeat check",
time.Since(d.deaconLastStarted).Round(time.Second))
return
}
hb := deacon.ReadHeartbeat(d.config.TownRoot)
if hb == nil {
// No heartbeat file - Deacon hasn't started a cycle yet
@@ -415,7 +468,8 @@ func (d *Daemon) ensureWitnessRunning(rigName string) {
if err := mgr.Start(false, "", nil); err != nil {
if err == witness.ErrAlreadyRunning {
// Already running - nothing to do
// Already running - this is the expected case
d.logger.Printf("Witness for %s already running, skipping spawn", rigName)
return
}
d.logger.Printf("Error starting witness for %s: %v", rigName, err)
@@ -454,7 +508,8 @@ func (d *Daemon) ensureRefineryRunning(rigName string) {
if err := mgr.Start(false, ""); err != nil {
if err == refinery.ErrAlreadyRunning {
// Already running - nothing to do
// Already running - this is the expected case when fix is working
d.logger.Printf("Refinery for %s already running, skipping spawn", rigName)
return
}
d.logger.Printf("Error starting refinery for %s: %v", rigName, err)

View File

@@ -0,0 +1,53 @@
package daemon
import (
"os"
"path/filepath"
"testing"
)
func TestLoadPatrolConfig(t *testing.T) {
// Create a temp dir with test config
tmpDir := t.TempDir()
mayorDir := filepath.Join(tmpDir, "mayor")
if err := os.MkdirAll(mayorDir, 0755); err != nil {
t.Fatal(err)
}
// Write test config
configJSON := `{
"type": "daemon-patrol-config",
"version": 1,
"patrols": {
"refinery": {"enabled": false},
"witness": {"enabled": true}
}
}`
if err := os.WriteFile(filepath.Join(mayorDir, "daemon.json"), []byte(configJSON), 0644); err != nil {
t.Fatal(err)
}
// Load config
config := LoadPatrolConfig(tmpDir)
if config == nil {
t.Fatal("expected config to be loaded")
}
// Test enabled flags
if IsPatrolEnabled(config, "refinery") {
t.Error("expected refinery to be disabled")
}
if !IsPatrolEnabled(config, "witness") {
t.Error("expected witness to be enabled")
}
if !IsPatrolEnabled(config, "deacon") {
t.Error("expected deacon to be enabled (default)")
}
}
func TestIsPatrolEnabled_NilConfig(t *testing.T) {
// Should default to enabled when config is nil
if !IsPatrolEnabled(nil, "refinery") {
t.Error("expected default to be enabled")
}
}

View File

@@ -96,6 +96,78 @@ func SaveState(townRoot string, state *State) error {
return util.AtomicWriteJSON(stateFile, state)
}
// PatrolConfig holds configuration for a single patrol.
type PatrolConfig struct {
// Enabled controls whether this patrol runs during heartbeat.
Enabled bool `json:"enabled"`
// Interval is how often to run this patrol (not used yet).
Interval string `json:"interval,omitempty"`
// Agent is the agent type for this patrol (not used yet).
Agent string `json:"agent,omitempty"`
}
// PatrolsConfig holds configuration for all patrols.
type PatrolsConfig struct {
Refinery *PatrolConfig `json:"refinery,omitempty"`
Witness *PatrolConfig `json:"witness,omitempty"`
Deacon *PatrolConfig `json:"deacon,omitempty"`
}
// DaemonPatrolConfig is the structure of mayor/daemon.json.
type DaemonPatrolConfig struct {
Type string `json:"type"`
Version int `json:"version"`
Heartbeat *PatrolConfig `json:"heartbeat,omitempty"`
Patrols *PatrolsConfig `json:"patrols,omitempty"`
}
// PatrolConfigFile returns the path to the patrol config file.
func PatrolConfigFile(townRoot string) string {
return filepath.Join(townRoot, "mayor", "daemon.json")
}
// LoadPatrolConfig loads patrol configuration from mayor/daemon.json.
// Returns nil if the file doesn't exist or can't be parsed.
func LoadPatrolConfig(townRoot string) *DaemonPatrolConfig {
configFile := PatrolConfigFile(townRoot)
data, err := os.ReadFile(configFile)
if err != nil {
return nil
}
var config DaemonPatrolConfig
if err := json.Unmarshal(data, &config); err != nil {
return nil
}
return &config
}
// IsPatrolEnabled checks if a patrol is enabled in the config.
// Returns true if the config doesn't exist (default enabled for backwards compatibility).
func IsPatrolEnabled(config *DaemonPatrolConfig, patrol string) bool {
if config == nil || config.Patrols == nil {
return true // Default: enabled
}
switch patrol {
case "refinery":
if config.Patrols.Refinery != nil {
return config.Patrols.Refinery.Enabled
}
case "witness":
if config.Patrols.Witness != nil {
return config.Patrols.Witness.Enabled
}
case "deacon":
if config.Patrols.Deacon != nil {
return config.Patrols.Deacon.Enabled
}
}
return true // Default: enabled
}
// LifecycleAction represents a lifecycle request action.
type LifecycleAction string