fix: Add grace period to prevent Deacon restart loop (#590)
* fix(daemon): prevent runaway refinery session spawning

  Fixes #566

  The daemon spawned 812 refinery sessions over 4 days because:

  1. Zombie detection was too strict - used IsAgentRunning(session, "node") but Claude reports pane command as a version number (e.g., "2.1.7"), causing healthy sessions to be killed and recreated every heartbeat.
  2. daemon.json patrol config was completely ignored - the daemon never loaded or checked the enabled flags.

  Changes:

  - refinery/manager.go: Use IsClaudeRunning() instead of IsAgentRunning() for robust Claude detection (handles "node", "claude", version patterns)
  - daemon/types.go: Add PatrolConfig types and LoadPatrolConfig() to read mayor/daemon.json
  - daemon/daemon.go: Load patrol config at startup, check enabled flags before calling ensureRefineriesRunning/ensureWitnessesRunning, add diagnostic logging for "already running" cases

  Tested: Verified over multiple heartbeats that refinery shows "already running, skipping spawn" instead of spawning new sessions.

* fix: Add grace period to prevent Deacon restart loop

  The daemon had a race condition where:

  1. ensureDeaconRunning() starts a new Deacon session
  2. checkDeaconHeartbeat() runs in the same heartbeat cycle
  3. Heartbeat file is stale (from before crash)
  4. Session is immediately killed
  5. Infinite restart loop every 3 minutes

  Fix:

  - Track when Deacon was last started (deaconLastStarted field)
  - Skip heartbeat check during 5-minute grace period
  - Add config support for Deacon (consistency with refinery/witness)

  After grace period, normal heartbeat checking resumes. Genuinely stuck sessions (no heartbeat update after 5+ min) are still detected.

  Fixes #589

---------

Co-authored-by: mayor <your-github-email@example.com>
This commit is contained in:
53
internal/daemon/patrol_config_test.go
Normal file
53
internal/daemon/patrol_config_test.go
Normal file
@@ -0,0 +1,53 @@
|
||||
package daemon
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestLoadPatrolConfig(t *testing.T) {
|
||||
// Create a temp dir with test config
|
||||
tmpDir := t.TempDir()
|
||||
mayorDir := filepath.Join(tmpDir, "mayor")
|
||||
if err := os.MkdirAll(mayorDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Write test config
|
||||
configJSON := `{
|
||||
"type": "daemon-patrol-config",
|
||||
"version": 1,
|
||||
"patrols": {
|
||||
"refinery": {"enabled": false},
|
||||
"witness": {"enabled": true}
|
||||
}
|
||||
}`
|
||||
if err := os.WriteFile(filepath.Join(mayorDir, "daemon.json"), []byte(configJSON), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Load config
|
||||
config := LoadPatrolConfig(tmpDir)
|
||||
if config == nil {
|
||||
t.Fatal("expected config to be loaded")
|
||||
}
|
||||
|
||||
// Test enabled flags
|
||||
if IsPatrolEnabled(config, "refinery") {
|
||||
t.Error("expected refinery to be disabled")
|
||||
}
|
||||
if !IsPatrolEnabled(config, "witness") {
|
||||
t.Error("expected witness to be enabled")
|
||||
}
|
||||
if !IsPatrolEnabled(config, "deacon") {
|
||||
t.Error("expected deacon to be enabled (default)")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsPatrolEnabled_NilConfig(t *testing.T) {
|
||||
// Should default to enabled when config is nil
|
||||
if !IsPatrolEnabled(nil, "refinery") {
|
||||
t.Error("expected default to be enabled")
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user