fix(deacon): improve health check reliability and error handling (#499)

Co-authored-by: Dylan <sigfawn@gmail.com>
This commit is contained in:
Steve Yegge
2026-01-13 22:34:03 -08:00
committed by GitHub
parent 92144757ac
commit 42d9890e5c

View File

@@ -1,6 +1,7 @@
package cmd
import (
"context"
"encoding/json"
"errors"
"fmt"
@@ -348,7 +349,7 @@ func startDeaconSession(t *tmux.Tmux, sessionName, agentOverride string) error {
// Ensure Claude settings exist (autonomous role needs mail in SessionStart)
if err := claude.EnsureSettingsForRole(deaconDir, "deacon"); err != nil {
style.PrintWarning("Could not create deacon settings: %v", err)
return fmt.Errorf("creating deacon settings: %w", err)
}
// Build startup command first
@@ -380,9 +381,9 @@ func startDeaconSession(t *tmux.Tmux, sessionName, agentOverride string) error {
theme := tmux.DeaconTheme()
_ = t.ConfigureGasTownSession(sessionName, theme, "", "Deacon", "health-check")
// Wait for Claude to start (non-fatal)
// Wait for Claude to start
if err := t.WaitForCommand(sessionName, constants.SupportedShells, constants.ClaudeStartTimeout); err != nil {
// Non-fatal
return fmt.Errorf("waiting for deacon to start: %w", err)
}
time.Sleep(constants.ShutdownNotifyDelay)
@@ -390,17 +391,21 @@ func startDeaconSession(t *tmux.Tmux, sessionName, agentOverride string) error {
_ = runtime.RunStartupFallback(t, sessionName, "deacon", runtimeConfig)
// Inject startup nudge for predecessor discovery via /resume
_ = session.StartupNudge(t, sessionName, session.StartupNudgeConfig{
if err := session.StartupNudge(t, sessionName, session.StartupNudgeConfig{
Recipient: "deacon",
Sender: "daemon",
Topic: "patrol",
}) // Non-fatal
}); err != nil {
style.PrintWarning("failed to send startup nudge: %v", err)
}
// GUPP: Gas Town Universal Propulsion Principle
// Send the propulsion nudge to trigger autonomous patrol execution.
// Wait for beacon to be fully processed (needs to be separate prompt)
time.Sleep(2 * time.Second)
_ = t.NudgeSession(sessionName, session.PropulsionNudgeForRole("deacon", deaconDir)) // Non-fatal
if err := t.NudgeSession(sessionName, session.PropulsionNudgeForRole("deacon", deaconDir)); err != nil {
return fmt.Errorf("sending propulsion nudge: %w", err)
}
return nil
}
@@ -698,25 +703,35 @@ func runDeaconHealthCheck(cmd *cobra.Command, args []string) error {
fmt.Printf("%s Sent HEALTH_CHECK to %s, waiting %s...\n",
style.Bold.Render("→"), agent, healthCheckTimeout)
// Wait for response
deadline := time.Now().Add(healthCheckTimeout)
// Wait for response using context and ticker for reliability
// This prevents loop hangs if system clock changes
ctx, cancel := context.WithTimeout(context.Background(), healthCheckTimeout)
defer cancel()
ticker := time.NewTicker(2 * time.Second)
defer ticker.Stop()
responded := false
for time.Now().Before(deadline) {
time.Sleep(2 * time.Second) // Check every 2 seconds
for {
select {
case <-ctx.Done():
goto Done
case <-ticker.C:
newTime, err := getAgentBeadUpdateTime(townRoot, beadID)
if err != nil {
continue
}
newTime, err := getAgentBeadUpdateTime(townRoot, beadID)
if err != nil {
continue
}
// If bead was updated after our baseline, agent responded
if newTime.After(baselineTime) {
responded = true
break
// If bead was updated after our baseline, agent responded
if newTime.After(baselineTime) {
responded = true
goto Done
}
}
}
Done:
// Record result
if responded {
agentState.RecordResponse()