fix: Add grace period to prevent Deacon restart loop (#590)
* fix(daemon): prevent runaway refinery session spawning Fixes #566 The daemon spawned 812 refinery sessions over 4 days because: 1. Zombie detection was too strict - used IsAgentRunning(session, "node") but Claude reports pane command as version number (e.g., "2.1.7"), causing healthy sessions to be killed and recreated every heartbeat. 2. daemon.json patrol config was completely ignored - the daemon never loaded or checked the enabled flags. Changes: - refinery/manager.go: Use IsClaudeRunning() instead of IsAgentRunning() for robust Claude detection (handles "node", "claude", version patterns) - daemon/types.go: Add PatrolConfig types and LoadPatrolConfig() to read mayor/daemon.json - daemon/daemon.go: Load patrol config at startup, check enabled flags before calling ensureRefineriesRunning/ensureWitnessesRunning, add diagnostic logging for "already running" cases Tested: Verified over multiple heartbeats that refinery shows "already running, skipping spawn" instead of spawning new sessions. * fix: Add grace period to prevent Deacon restart loop The daemon had a race condition where: 1. ensureDeaconRunning() starts a new Deacon session 2. checkDeaconHeartbeat() runs in same heartbeat cycle 3. Heartbeat file is stale (from before crash) 4. Session is immediately killed 5. Infinite restart loop every 3 minutes Fix: - Track when Deacon was last started (deaconLastStarted field) - Skip heartbeat check during 5-minute grace period - Add config support for Deacon (consistency with refinery/witness) After grace period, normal heartbeat checking resumes. Genuinely stuck sessions (no heartbeat update after 5+ min) are still detected. Fixes #589 --------- Co-authored-by: mayor <your-github-email@example.com>
This commit is contained in:
@@ -37,17 +37,24 @@ import (
|
||||
// This is recovery-focused: normal wake is handled by feed subscription (bd activity --follow).
|
||||
// The daemon is the safety net for dead sessions, GUPP violations, and orphaned work.
|
||||
type Daemon struct {
|
||||
config *Config
|
||||
tmux *tmux.Tmux
|
||||
logger *log.Logger
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
curator *feed.Curator
|
||||
config *Config
|
||||
patrolConfig *DaemonPatrolConfig
|
||||
tmux *tmux.Tmux
|
||||
logger *log.Logger
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
curator *feed.Curator
|
||||
convoyWatcher *ConvoyWatcher
|
||||
|
||||
// Mass death detection: track recent session deaths
|
||||
deathsMu sync.Mutex
|
||||
recentDeaths []sessionDeath
|
||||
|
||||
// Deacon startup tracking: prevents race condition where newly started
|
||||
// sessions are immediately killed by the heartbeat check.
|
||||
// See: https://github.com/steveyegge/gastown/issues/567
|
||||
// Note: Only accessed from heartbeat loop goroutine - no sync needed.
|
||||
deaconLastStarted time.Time
|
||||
}
|
||||
|
||||
// sessionDeath records a detected session death for mass death analysis.
|
||||
@@ -79,12 +86,19 @@ func New(config *Config) (*Daemon, error) {
|
||||
logger := log.New(logFile, "", log.LstdFlags)
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
// Load patrol config from mayor/daemon.json (optional - nil if missing)
|
||||
patrolConfig := LoadPatrolConfig(config.TownRoot)
|
||||
if patrolConfig != nil {
|
||||
logger.Printf("Loaded patrol config from %s", PatrolConfigFile(config.TownRoot))
|
||||
}
|
||||
|
||||
return &Daemon{
|
||||
config: config,
|
||||
tmux: tmux.NewTmux(),
|
||||
logger: logger,
|
||||
ctx: ctx,
|
||||
cancel: cancel,
|
||||
config: config,
|
||||
patrolConfig: patrolConfig,
|
||||
tmux: tmux.NewTmux(),
|
||||
logger: logger,
|
||||
ctx: ctx,
|
||||
cancel: cancel,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -197,21 +211,42 @@ func (d *Daemon) heartbeat(state *State) {
|
||||
d.logger.Println("Heartbeat starting (recovery-focused)")
|
||||
|
||||
// 1. Ensure Deacon is running (restart if dead)
|
||||
d.ensureDeaconRunning()
|
||||
// Check patrol config - can be disabled in mayor/daemon.json
|
||||
if IsPatrolEnabled(d.patrolConfig, "deacon") {
|
||||
d.ensureDeaconRunning()
|
||||
} else {
|
||||
d.logger.Printf("Deacon patrol disabled in config, skipping")
|
||||
}
|
||||
|
||||
// 2. Poke Boot for intelligent triage (stuck/nudge/interrupt)
|
||||
// Boot handles nuanced "is Deacon responsive" decisions
|
||||
d.ensureBootRunning()
|
||||
// Only run if Deacon patrol is enabled
|
||||
if IsPatrolEnabled(d.patrolConfig, "deacon") {
|
||||
d.ensureBootRunning()
|
||||
}
|
||||
|
||||
// 3. Direct Deacon heartbeat check (belt-and-suspenders)
|
||||
// Boot may not detect all stuck states; this provides a fallback
|
||||
d.checkDeaconHeartbeat()
|
||||
// Only run if Deacon patrol is enabled
|
||||
if IsPatrolEnabled(d.patrolConfig, "deacon") {
|
||||
d.checkDeaconHeartbeat()
|
||||
}
|
||||
|
||||
// 4. Ensure Witnesses are running for all rigs (restart if dead)
|
||||
d.ensureWitnessesRunning()
|
||||
// Check patrol config - can be disabled in mayor/daemon.json
|
||||
if IsPatrolEnabled(d.patrolConfig, "witness") {
|
||||
d.ensureWitnessesRunning()
|
||||
} else {
|
||||
d.logger.Printf("Witness patrol disabled in config, skipping")
|
||||
}
|
||||
|
||||
// 5. Ensure Refineries are running for all rigs (restart if dead)
|
||||
d.ensureRefineriesRunning()
|
||||
// Check patrol config - can be disabled in mayor/daemon.json
|
||||
if IsPatrolEnabled(d.patrolConfig, "refinery") {
|
||||
d.ensureRefineriesRunning()
|
||||
} else {
|
||||
d.logger.Printf("Refinery patrol disabled in config, skipping")
|
||||
}
|
||||
|
||||
// 6. Trigger pending polecat spawns (bootstrap mode - ZFC violation acceptable)
|
||||
// This ensures polecats get nudged even when Deacon isn't in a patrol cycle.
|
||||
@@ -331,13 +366,31 @@ func (d *Daemon) ensureDeaconRunning() {
|
||||
return
|
||||
}
|
||||
|
||||
// Track when we started the Deacon to prevent race condition in checkDeaconHeartbeat.
|
||||
// The heartbeat file will still be stale until the Deacon runs a full patrol cycle.
|
||||
d.deaconLastStarted = time.Now()
|
||||
d.logger.Println("Deacon started successfully")
|
||||
}
|
||||
|
||||
// deaconGracePeriod is the time to wait after starting a Deacon before checking heartbeat.
|
||||
// The Deacon needs time to initialize Claude, run SessionStart hooks, execute gt prime,
|
||||
// run a patrol cycle, and write a fresh heartbeat. 5 minutes is conservative.
|
||||
const deaconGracePeriod = 5 * time.Minute
|
||||
|
||||
// checkDeaconHeartbeat checks if the Deacon is making progress.
|
||||
// This is a belt-and-suspenders fallback in case Boot doesn't detect stuck states.
|
||||
// Uses the heartbeat file that the Deacon updates on each patrol cycle.
|
||||
func (d *Daemon) checkDeaconHeartbeat() {
|
||||
// Grace period: don't check heartbeat for newly started sessions.
|
||||
// This prevents the race condition where we start a Deacon, then immediately
|
||||
// see a stale heartbeat (from before the crash) and kill the session we just started.
|
||||
// See: https://github.com/steveyegge/gastown/issues/567
|
||||
if !d.deaconLastStarted.IsZero() && time.Since(d.deaconLastStarted) < deaconGracePeriod {
|
||||
d.logger.Printf("Deacon started recently (%s ago), skipping heartbeat check",
|
||||
time.Since(d.deaconLastStarted).Round(time.Second))
|
||||
return
|
||||
}
|
||||
|
||||
hb := deacon.ReadHeartbeat(d.config.TownRoot)
|
||||
if hb == nil {
|
||||
// No heartbeat file - Deacon hasn't started a cycle yet
|
||||
@@ -415,7 +468,8 @@ func (d *Daemon) ensureWitnessRunning(rigName string) {
|
||||
|
||||
if err := mgr.Start(false, "", nil); err != nil {
|
||||
if err == witness.ErrAlreadyRunning {
|
||||
// Already running - nothing to do
|
||||
// Already running - this is the expected case
|
||||
d.logger.Printf("Witness for %s already running, skipping spawn", rigName)
|
||||
return
|
||||
}
|
||||
d.logger.Printf("Error starting witness for %s: %v", rigName, err)
|
||||
@@ -454,7 +508,8 @@ func (d *Daemon) ensureRefineryRunning(rigName string) {
|
||||
|
||||
if err := mgr.Start(false, ""); err != nil {
|
||||
if err == refinery.ErrAlreadyRunning {
|
||||
// Already running - nothing to do
|
||||
// Already running - this is the expected case when fix is working
|
||||
d.logger.Printf("Refinery for %s already running, skipping spawn", rigName)
|
||||
return
|
||||
}
|
||||
d.logger.Printf("Error starting refinery for %s: %v", rigName, err)
|
||||
|
||||
53
internal/daemon/patrol_config_test.go
Normal file
53
internal/daemon/patrol_config_test.go
Normal file
@@ -0,0 +1,53 @@
|
||||
package daemon
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestLoadPatrolConfig(t *testing.T) {
|
||||
// Create a temp dir with test config
|
||||
tmpDir := t.TempDir()
|
||||
mayorDir := filepath.Join(tmpDir, "mayor")
|
||||
if err := os.MkdirAll(mayorDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Write test config
|
||||
configJSON := `{
|
||||
"type": "daemon-patrol-config",
|
||||
"version": 1,
|
||||
"patrols": {
|
||||
"refinery": {"enabled": false},
|
||||
"witness": {"enabled": true}
|
||||
}
|
||||
}`
|
||||
if err := os.WriteFile(filepath.Join(mayorDir, "daemon.json"), []byte(configJSON), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Load config
|
||||
config := LoadPatrolConfig(tmpDir)
|
||||
if config == nil {
|
||||
t.Fatal("expected config to be loaded")
|
||||
}
|
||||
|
||||
// Test enabled flags
|
||||
if IsPatrolEnabled(config, "refinery") {
|
||||
t.Error("expected refinery to be disabled")
|
||||
}
|
||||
if !IsPatrolEnabled(config, "witness") {
|
||||
t.Error("expected witness to be enabled")
|
||||
}
|
||||
if !IsPatrolEnabled(config, "deacon") {
|
||||
t.Error("expected deacon to be enabled (default)")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsPatrolEnabled_NilConfig(t *testing.T) {
|
||||
// Should default to enabled when config is nil
|
||||
if !IsPatrolEnabled(nil, "refinery") {
|
||||
t.Error("expected default to be enabled")
|
||||
}
|
||||
}
|
||||
@@ -96,6 +96,78 @@ func SaveState(townRoot string, state *State) error {
|
||||
return util.AtomicWriteJSON(stateFile, state)
|
||||
}
|
||||
|
||||
// PatrolConfig holds configuration for a single patrol.
|
||||
type PatrolConfig struct {
|
||||
// Enabled controls whether this patrol runs during heartbeat.
|
||||
Enabled bool `json:"enabled"`
|
||||
|
||||
// Interval is how often to run this patrol (not used yet).
|
||||
Interval string `json:"interval,omitempty"`
|
||||
|
||||
// Agent is the agent type for this patrol (not used yet).
|
||||
Agent string `json:"agent,omitempty"`
|
||||
}
|
||||
|
||||
// PatrolsConfig holds configuration for all patrols.
|
||||
type PatrolsConfig struct {
|
||||
Refinery *PatrolConfig `json:"refinery,omitempty"`
|
||||
Witness *PatrolConfig `json:"witness,omitempty"`
|
||||
Deacon *PatrolConfig `json:"deacon,omitempty"`
|
||||
}
|
||||
|
||||
// DaemonPatrolConfig is the structure of mayor/daemon.json.
|
||||
type DaemonPatrolConfig struct {
|
||||
Type string `json:"type"`
|
||||
Version int `json:"version"`
|
||||
Heartbeat *PatrolConfig `json:"heartbeat,omitempty"`
|
||||
Patrols *PatrolsConfig `json:"patrols,omitempty"`
|
||||
}
|
||||
|
||||
// PatrolConfigFile returns the path to the patrol config file.
|
||||
func PatrolConfigFile(townRoot string) string {
|
||||
return filepath.Join(townRoot, "mayor", "daemon.json")
|
||||
}
|
||||
|
||||
// LoadPatrolConfig loads patrol configuration from mayor/daemon.json.
|
||||
// Returns nil if the file doesn't exist or can't be parsed.
|
||||
func LoadPatrolConfig(townRoot string) *DaemonPatrolConfig {
|
||||
configFile := PatrolConfigFile(townRoot)
|
||||
data, err := os.ReadFile(configFile)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var config DaemonPatrolConfig
|
||||
if err := json.Unmarshal(data, &config); err != nil {
|
||||
return nil
|
||||
}
|
||||
return &config
|
||||
}
|
||||
|
||||
// IsPatrolEnabled checks if a patrol is enabled in the config.
|
||||
// Returns true if the config doesn't exist (default enabled for backwards compatibility).
|
||||
func IsPatrolEnabled(config *DaemonPatrolConfig, patrol string) bool {
|
||||
if config == nil || config.Patrols == nil {
|
||||
return true // Default: enabled
|
||||
}
|
||||
|
||||
switch patrol {
|
||||
case "refinery":
|
||||
if config.Patrols.Refinery != nil {
|
||||
return config.Patrols.Refinery.Enabled
|
||||
}
|
||||
case "witness":
|
||||
if config.Patrols.Witness != nil {
|
||||
return config.Patrols.Witness.Enabled
|
||||
}
|
||||
case "deacon":
|
||||
if config.Patrols.Deacon != nil {
|
||||
return config.Patrols.Deacon.Enabled
|
||||
}
|
||||
}
|
||||
return true // Default: enabled
|
||||
}
|
||||
|
||||
// LifecycleAction represents a lifecycle request action.
type LifecycleAction string
||||
|
||||
Reference in New Issue
Block a user