fix(deacon): improve health check reliability and error handling (#499)
Co-authored-by: Dylan <sigfawn@gmail.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
@@ -348,7 +349,7 @@ func startDeaconSession(t *tmux.Tmux, sessionName, agentOverride string) error {
|
||||
|
||||
// Ensure Claude settings exist (autonomous role needs mail in SessionStart)
|
||||
if err := claude.EnsureSettingsForRole(deaconDir, "deacon"); err != nil {
|
||||
style.PrintWarning("Could not create deacon settings: %v", err)
|
||||
return fmt.Errorf("creating deacon settings: %w", err)
|
||||
}
|
||||
|
||||
// Build startup command first
|
||||
@@ -380,9 +381,9 @@ func startDeaconSession(t *tmux.Tmux, sessionName, agentOverride string) error {
|
||||
theme := tmux.DeaconTheme()
|
||||
_ = t.ConfigureGasTownSession(sessionName, theme, "", "Deacon", "health-check")
|
||||
|
||||
// Wait for Claude to start (non-fatal)
|
||||
// Wait for Claude to start
|
||||
if err := t.WaitForCommand(sessionName, constants.SupportedShells, constants.ClaudeStartTimeout); err != nil {
|
||||
// Non-fatal
|
||||
return fmt.Errorf("waiting for deacon to start: %w", err)
|
||||
}
|
||||
time.Sleep(constants.ShutdownNotifyDelay)
|
||||
|
||||
@@ -390,17 +391,21 @@ func startDeaconSession(t *tmux.Tmux, sessionName, agentOverride string) error {
|
||||
_ = runtime.RunStartupFallback(t, sessionName, "deacon", runtimeConfig)
|
||||
|
||||
// Inject startup nudge for predecessor discovery via /resume
|
||||
_ = session.StartupNudge(t, sessionName, session.StartupNudgeConfig{
|
||||
if err := session.StartupNudge(t, sessionName, session.StartupNudgeConfig{
|
||||
Recipient: "deacon",
|
||||
Sender: "daemon",
|
||||
Topic: "patrol",
|
||||
}) // Non-fatal
|
||||
}); err != nil {
|
||||
style.PrintWarning("failed to send startup nudge: %v", err)
|
||||
}
|
||||
|
||||
// GUPP: Gas Town Universal Propulsion Principle
|
||||
// Send the propulsion nudge to trigger autonomous patrol execution.
|
||||
// Wait for beacon to be fully processed (needs to be separate prompt)
|
||||
time.Sleep(2 * time.Second)
|
||||
_ = t.NudgeSession(sessionName, session.PropulsionNudgeForRole("deacon", deaconDir)) // Non-fatal
|
||||
if err := t.NudgeSession(sessionName, session.PropulsionNudgeForRole("deacon", deaconDir)); err != nil {
|
||||
return fmt.Errorf("sending propulsion nudge: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -698,25 +703,35 @@ func runDeaconHealthCheck(cmd *cobra.Command, args []string) error {
|
||||
fmt.Printf("%s Sent HEALTH_CHECK to %s, waiting %s...\n",
|
||||
style.Bold.Render("→"), agent, healthCheckTimeout)
|
||||
|
||||
// Wait for response
|
||||
deadline := time.Now().Add(healthCheckTimeout)
|
||||
// Wait for response using context and ticker for reliability
|
||||
// This prevents loop hangs if system clock changes
|
||||
ctx, cancel := context.WithTimeout(context.Background(), healthCheckTimeout)
|
||||
defer cancel()
|
||||
|
||||
ticker := time.NewTicker(2 * time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
responded := false
|
||||
|
||||
for time.Now().Before(deadline) {
|
||||
time.Sleep(2 * time.Second) // Check every 2 seconds
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
goto Done
|
||||
case <-ticker.C:
|
||||
newTime, err := getAgentBeadUpdateTime(townRoot, beadID)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
newTime, err := getAgentBeadUpdateTime(townRoot, beadID)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// If bead was updated after our baseline, agent responded
|
||||
if newTime.After(baselineTime) {
|
||||
responded = true
|
||||
break
|
||||
// If bead was updated after our baseline, agent responded
|
||||
if newTime.After(baselineTime) {
|
||||
responded = true
|
||||
goto Done
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Done:
|
||||
// Record result
|
||||
if responded {
|
||||
agentState.RecordResponse()
|
||||
|
||||
Reference in New Issue
Block a user