fix: Add grace period to prevent Deacon restart loop (#590)
* fix(daemon): prevent runaway refinery session spawning

Fixes #566

The daemon spawned 812 refinery sessions over 4 days because:

1. Zombie detection was too strict - used IsAgentRunning(session, "node") but Claude reports pane command as version number (e.g., "2.1.7"), causing healthy sessions to be killed and recreated every heartbeat.
2. daemon.json patrol config was completely ignored - the daemon never loaded or checked the enabled flags.

Changes:

- refinery/manager.go: Use IsClaudeRunning() instead of IsAgentRunning() for robust Claude detection (handles "node", "claude", version patterns)
- daemon/types.go: Add PatrolConfig types and LoadPatrolConfig() to read mayor/daemon.json
- daemon/daemon.go: Load patrol config at startup, check enabled flags before calling ensureRefineriesRunning/ensureWitnessesRunning, add diagnostic logging for "already running" cases

Tested: Verified over multiple heartbeats that refinery shows "already running, skipping spawn" instead of spawning new sessions.

* fix: Add grace period to prevent Deacon restart loop

The daemon had a race condition where:

1. ensureDeaconRunning() starts a new Deacon session
2. checkDeaconHeartbeat() runs in same heartbeat cycle
3. Heartbeat file is stale (from before crash)
4. Session is immediately killed
5. Infinite restart loop every 3 minutes

Fix:

- Track when Deacon was last started (deaconLastStarted field)
- Skip heartbeat check during 5-minute grace period
- Add config support for Deacon (consistency with refinery/witness)

After grace period, normal heartbeat checking resumes. Genuinely stuck sessions (no heartbeat update after 5+ min) are still detected.

Fixes #589

---------

Co-authored-by: mayor <your-github-email@example.com>
This commit is contained in:
@@ -96,6 +96,78 @@ func SaveState(townRoot string, state *State) error {
|
||||
return util.AtomicWriteJSON(stateFile, state)
|
||||
}
|
||||
|
||||
// PatrolConfig is the per-patrol settings object decoded from JSON.
type PatrolConfig struct {
	// Enabled reports whether the patrol should run during heartbeat.
	// It is always serialized (no omitempty), so an explicit false survives
	// a round trip.
	Enabled bool `json:"enabled"`

	// Interval is how often to run this patrol (not used yet).
	// It is kept as the raw string from the file.
	Interval string `json:"interval,omitempty"`

	// Agent is the agent type for this patrol (not used yet).
	Agent string `json:"agent,omitempty"`
}
|
||||
|
||||
// PatrolsConfig holds configuration for all patrols.
|
||||
type PatrolsConfig struct {
|
||||
Refinery *PatrolConfig `json:"refinery,omitempty"`
|
||||
Witness *PatrolConfig `json:"witness,omitempty"`
|
||||
Deacon *PatrolConfig `json:"deacon,omitempty"`
|
||||
}
|
||||
|
||||
// DaemonPatrolConfig is the structure of mayor/daemon.json.
|
||||
type DaemonPatrolConfig struct {
|
||||
Type string `json:"type"`
|
||||
Version int `json:"version"`
|
||||
Heartbeat *PatrolConfig `json:"heartbeat,omitempty"`
|
||||
Patrols *PatrolsConfig `json:"patrols,omitempty"`
|
||||
}
|
||||
|
||||
// PatrolConfigFile builds the location of the patrol configuration file,
// i.e. <townRoot>/mayor/daemon.json, using the OS path separator.
func PatrolConfigFile(townRoot string) string {
	segments := []string{townRoot, "mayor", "daemon.json"}
	return filepath.Join(segments...)
}
|
||||
|
||||
// LoadPatrolConfig loads patrol configuration from mayor/daemon.json.
|
||||
// Returns nil if the file doesn't exist or can't be parsed.
|
||||
func LoadPatrolConfig(townRoot string) *DaemonPatrolConfig {
|
||||
configFile := PatrolConfigFile(townRoot)
|
||||
data, err := os.ReadFile(configFile)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var config DaemonPatrolConfig
|
||||
if err := json.Unmarshal(data, &config); err != nil {
|
||||
return nil
|
||||
}
|
||||
return &config
|
||||
}
|
||||
|
||||
// IsPatrolEnabled checks if a patrol is enabled in the config.
|
||||
// Returns true if the config doesn't exist (default enabled for backwards compatibility).
|
||||
func IsPatrolEnabled(config *DaemonPatrolConfig, patrol string) bool {
|
||||
if config == nil || config.Patrols == nil {
|
||||
return true // Default: enabled
|
||||
}
|
||||
|
||||
switch patrol {
|
||||
case "refinery":
|
||||
if config.Patrols.Refinery != nil {
|
||||
return config.Patrols.Refinery.Enabled
|
||||
}
|
||||
case "witness":
|
||||
if config.Patrols.Witness != nil {
|
||||
return config.Patrols.Witness.Enabled
|
||||
}
|
||||
case "deacon":
|
||||
if config.Patrols.Deacon != nil {
|
||||
return config.Patrols.Deacon.Enabled
|
||||
}
|
||||
}
|
||||
return true // Default: enabled
|
||||
}
|
||||
|
||||
// LifecycleAction is the string-backed type used to name the action of a
// lifecycle request.
// NOTE(review): the set of valid values is declared outside this view.
type LifecycleAction string
|
||||
|
||||
|
||||
Reference in New Issue
Block a user