Daemon heartbeat becomes recovery-focused (gt-vdprb.4)
Change the daemon from wake-focused to recovery-focused.

Before: the daemon pokes agents every 5-60 min as the primary wake mechanism.
After: the daemon only checks for edge cases that feed-wake cannot handle.

Recovery checks:
- Dead sessions that need restart (ensureDeaconRunning, ensureWitnessesRunning)
- Stale agents that crashed without updating state (checkStaleAgents)
- GUPP violations: agents with work-on-hook not progressing (checkGUPPViolations)
- Orphaned work: work assigned to dead agents (checkOrphanedWork)

Removed:
- pokeDeacon() - no longer sending HEARTBEAT messages
- pokeWitness()/pokeWitnesses() - no longer sending HEARTBEAT messages
- MOTD message arrays - only used by poke functions

Normal agent wake is now handled by feed subscription (bd activity --follow). The daemon is the safety net for edge cases, not the primary propulsion.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -20,15 +20,15 @@ import (
|
||||
)
|
||||
|
||||
// Daemon is the town-level background service.
|
||||
// It ensures patrol agents (Deacon, Witnesses) are running and sends periodic heartbeats.
|
||||
// All health checking, nudging, and decision-making belongs in the patrol molecules.
|
||||
// It ensures patrol agents (Deacon, Witnesses) are running and detects failures.
|
||||
// This is recovery-focused: normal wake is handled by feed subscription (bd activity --follow).
|
||||
// The daemon is the safety net for dead sessions, GUPP violations, and orphaned work.
|
||||
type Daemon struct {
|
||||
config *Config
|
||||
tmux *tmux.Tmux
|
||||
logger *log.Logger
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
lastMOTDIndex int // tracks last MOTD to avoid consecutive repeats
|
||||
config *Config
|
||||
tmux *tmux.Tmux
|
||||
logger *log.Logger
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
}
|
||||
|
||||
// New creates a new daemon instance.
|
||||
@@ -169,35 +169,39 @@ func (d *Daemon) calculateHeartbeatInterval() time.Duration {
|
||||
}
|
||||
|
||||
// heartbeat performs one heartbeat cycle.
|
||||
// The daemon ensures patrol agents are running and sends heartbeats.
|
||||
// All health checking and decision-making belongs in the patrol molecules.
|
||||
// The daemon is recovery-focused: it ensures agents are running and detects failures.
|
||||
// Normal wake is handled by feed subscription (bd activity --follow).
|
||||
// The daemon is the safety net for edge cases:
|
||||
// - Dead sessions that need restart
|
||||
// - Agents with work-on-hook not progressing (GUPP violation)
|
||||
// - Orphaned work (assigned to dead agents)
|
||||
func (d *Daemon) heartbeat(state *State) {
|
||||
d.logger.Println("Heartbeat starting")
|
||||
d.logger.Println("Heartbeat starting (recovery-focused)")
|
||||
|
||||
// 1. Ensure Deacon is running (process management)
|
||||
// 1. Ensure Deacon is running (restart if dead)
|
||||
d.ensureDeaconRunning()
|
||||
|
||||
// 2. Send heartbeat to Deacon (simple notification, no decision-making)
|
||||
d.pokeDeacon()
|
||||
|
||||
// 3. Ensure Witnesses are running for all rigs
|
||||
// 2. Ensure Witnesses are running for all rigs (restart if dead)
|
||||
d.ensureWitnessesRunning()
|
||||
|
||||
// 4. Send heartbeats to Witnesses
|
||||
d.pokeWitnesses()
|
||||
|
||||
// 5. Trigger pending polecat spawns (bootstrap mode - ZFC violation acceptable)
|
||||
// 3. Trigger pending polecat spawns (bootstrap mode - ZFC violation acceptable)
|
||||
// This ensures polecats get nudged even when Deacon isn't in a patrol cycle.
|
||||
// Uses regex-based WaitForClaudeReady, which is acceptable for daemon bootstrap.
|
||||
d.triggerPendingSpawns()
|
||||
|
||||
// 6. Process lifecycle requests
|
||||
// 4. Process lifecycle requests
|
||||
d.processLifecycleRequests()
|
||||
|
||||
// 7. Check for stale agents (timeout fallback)
|
||||
// 5. Check for stale agents (timeout fallback)
|
||||
// Agents that report "running" but haven't updated in too long are marked dead
|
||||
d.checkStaleAgents()
|
||||
|
||||
// 6. Check for GUPP violations (agents with work-on-hook not progressing)
|
||||
d.checkGUPPViolations()
|
||||
|
||||
// 7. Check for orphaned work (assigned to dead agents)
|
||||
d.checkOrphanedWork()
|
||||
|
||||
// Update state
|
||||
state.LastHeartbeat = time.Now()
|
||||
state.HeartbeatCount++
|
||||
@@ -214,38 +218,6 @@ const DeaconSessionName = "gt-deacon"
|
||||
// DeaconRole is the role name for the Deacon's handoff bead.
|
||||
const DeaconRole = "deacon"
|
||||
|
||||
// deaconMOTDMessages contains rotating motivational and educational tips
|
||||
// for the Deacon heartbeat. These make the thankless patrol role more fun.
|
||||
var deaconMOTDMessages = []string{
|
||||
"Thanks for keeping the town running!",
|
||||
"You are Gas Town's most critical role.",
|
||||
"You are the heart of Gas Town! Be watchful!",
|
||||
"Tip: Polecats are transient - spawn freely, kill liberally.",
|
||||
"Tip: Witnesses monitor polecats; you monitor witnesses.",
|
||||
"Tip: Wisps are transient molecules for patrol cycles.",
|
||||
"The town sleeps soundly because you never do.",
|
||||
"Tip: Mayor handles cross-rig coordination; you handle health.",
|
||||
"Your vigilance keeps the agents honest.",
|
||||
"Tip: Use 'gt deacon heartbeat' to signal you're alive.",
|
||||
"Every heartbeat you check keeps Gas Town beating.",
|
||||
"Tip: Stale agents need nudging; very stale ones need restarting.",
|
||||
}
|
||||
|
||||
// nextMOTD returns the next MOTD message, rotating through the list
|
||||
// and avoiding consecutive repeats.
|
||||
func (d *Daemon) nextMOTD() string {
|
||||
if len(deaconMOTDMessages) == 0 {
|
||||
return "HEARTBEAT: run your rounds"
|
||||
}
|
||||
|
||||
// Pick a random index that's different from the last one
|
||||
nextIdx := d.lastMOTDIndex
|
||||
for nextIdx == d.lastMOTDIndex && len(deaconMOTDMessages) > 1 {
|
||||
nextIdx = int(time.Now().UnixNano() % int64(len(deaconMOTDMessages)))
|
||||
}
|
||||
d.lastMOTDIndex = nextIdx
|
||||
return deaconMOTDMessages[nextIdx]
|
||||
}
|
||||
|
||||
// ensureDeaconRunning ensures the Deacon is running.
|
||||
// ZFC-compliant: trusts agent bead state, no tmux inference.
|
||||
@@ -285,37 +257,6 @@ func (d *Daemon) ensureDeaconRunning() {
|
||||
d.logger.Println("Deacon session started successfully")
|
||||
}
|
||||
|
||||
// pokeDeacon sends a heartbeat message to the Deacon session.
|
||||
// ZFC-compliant: trusts agent bead state, no tmux inference.
|
||||
// The Deacon molecule decides what to do with heartbeats.
|
||||
func (d *Daemon) pokeDeacon() {
|
||||
// Check agent bead state (ZFC: trust what agent reports)
|
||||
beadState, beadErr := d.getAgentBeadState("gt-deacon")
|
||||
if beadErr != nil || (beadState != "running" && beadState != "working") {
|
||||
// Agent not running per bead - don't poke (ensureDeaconRunning should start it)
|
||||
return
|
||||
}
|
||||
|
||||
// Agent reports running - send heartbeat
|
||||
motd := d.nextMOTD()
|
||||
msg := fmt.Sprintf("HEARTBEAT: %s", motd)
|
||||
if err := d.tmux.SendKeysReplace(DeaconSessionName, msg, 50); err != nil {
|
||||
d.logger.Printf("Error poking Deacon: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
d.logger.Println("Poked Deacon")
|
||||
}
|
||||
|
||||
// witnessMOTDMessages contains rotating tips for witness heartbeats.
|
||||
var witnessMOTDMessages = []string{
|
||||
"Time to patrol! Check your polecats.",
|
||||
"Tip: Survey polecat health via agent beads.",
|
||||
"Tip: Verify git state before killing polecats.",
|
||||
"Your vigilance keeps polecats honest.",
|
||||
"Tip: Escalate stuck workers to Mayor.",
|
||||
"Tip: Send MERGE_READY when work is done.",
|
||||
}
|
||||
|
||||
// ensureWitnessesRunning ensures witnesses are running for all rigs.
|
||||
// Called on each heartbeat to maintain witness patrol loops.
|
||||
@@ -365,38 +306,6 @@ func (d *Daemon) ensureWitnessRunning(rigName string) {
|
||||
d.logger.Printf("Witness session for %s started successfully", rigName)
|
||||
}
|
||||
|
||||
// pokeWitnesses sends heartbeat messages to all witnesses.
|
||||
func (d *Daemon) pokeWitnesses() {
|
||||
rigs := d.getKnownRigs()
|
||||
for _, rigName := range rigs {
|
||||
d.pokeWitness(rigName)
|
||||
}
|
||||
}
|
||||
|
||||
// pokeWitness sends a heartbeat to a specific rig's witness.
|
||||
func (d *Daemon) pokeWitness(rigName string) {
|
||||
agentID := beads.WitnessBeadID(rigName)
|
||||
sessionName := "gt-" + rigName + "-witness"
|
||||
|
||||
// Check agent bead state (ZFC: trust what agent reports)
|
||||
beadState, beadErr := d.getAgentBeadState(agentID)
|
||||
if beadErr != nil || (beadState != "running" && beadState != "working") {
|
||||
// Agent not running per bead - don't poke
|
||||
return
|
||||
}
|
||||
|
||||
// Agent reports running - send heartbeat
|
||||
idx := int(time.Now().UnixNano() % int64(len(witnessMOTDMessages)))
|
||||
motd := witnessMOTDMessages[idx]
|
||||
msg := fmt.Sprintf("HEARTBEAT: %s", motd)
|
||||
|
||||
if err := d.tmux.SendKeysReplace(sessionName, msg, 50); err != nil {
|
||||
d.logger.Printf("Error poking witness for %s: %v", rigName, err)
|
||||
return
|
||||
}
|
||||
|
||||
d.logger.Printf("Poked witness for %s", rigName)
|
||||
}
|
||||
|
||||
// getKnownRigs returns list of registered rig names.
|
||||
func (d *Daemon) getKnownRigs() []string {
|
||||
@@ -406,17 +315,14 @@ func (d *Daemon) getKnownRigs() []string {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Simple extraction - look for rig names in the JSON
|
||||
// Full parsing would require importing config package
|
||||
var rigs []string
|
||||
// Parse just enough to get rig names
|
||||
type rigsJSON struct {
|
||||
var parsed struct {
|
||||
Rigs map[string]interface{} `json:"rigs"`
|
||||
}
|
||||
var parsed rigsJSON
|
||||
if err := json.Unmarshal(data, &parsed); err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var rigs []string
|
||||
for name := range parsed.Rigs {
|
||||
rigs = append(rigs, name)
|
||||
}
|
||||
|
||||
@@ -699,3 +699,210 @@ func identityToBDActor(identity string) string {
|
||||
return identity
|
||||
}
|
||||
}
|
||||
|
||||
// GUPPViolationTimeout is how long an agent can have work on hook without
|
||||
// progressing before it's considered a GUPP (Gas Town Universal Propulsion
|
||||
// Principle) violation. GUPP states: if you have work on your hook, you run it.
|
||||
const GUPPViolationTimeout = 30 * time.Minute
|
||||
|
||||
// checkGUPPViolations looks for agents that have work-on-hook but aren't
|
||||
// progressing. This is a GUPP violation: agents with hooked work must execute.
|
||||
// The daemon detects these and notifies the relevant Witness for remediation.
|
||||
func (d *Daemon) checkGUPPViolations() {
|
||||
// Check polecat agents - they're the ones with work-on-hook
|
||||
rigs := d.getKnownRigs()
|
||||
for _, rigName := range rigs {
|
||||
d.checkRigGUPPViolations(rigName)
|
||||
}
|
||||
}
|
||||
|
||||
// checkRigGUPPViolations checks polecats in a specific rig for GUPP violations.
|
||||
func (d *Daemon) checkRigGUPPViolations(rigName string) {
|
||||
// List polecat agent beads for this rig
|
||||
// Pattern: gt-polecat-<rig>-<name>
|
||||
cmd := exec.Command("bd", "list", "--type=agent", "--json")
|
||||
cmd.Dir = d.config.TownRoot
|
||||
|
||||
output, err := cmd.Output()
|
||||
if err != nil {
|
||||
return // Silently fail - bd might not be available
|
||||
}
|
||||
|
||||
var agents []struct {
|
||||
ID string `json:"id"`
|
||||
Type string `json:"issue_type"`
|
||||
Description string `json:"description"`
|
||||
UpdatedAt string `json:"updated_at"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(output, &agents); err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
prefix := "gt-polecat-" + rigName + "-"
|
||||
for _, agent := range agents {
|
||||
// Only check polecats for this rig
|
||||
if !strings.HasPrefix(agent.ID, prefix) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse agent fields
|
||||
fields := beads.ParseAgentFieldsFromDescription(agent.Description)
|
||||
if fields == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if agent has work on hook
|
||||
if fields.HookBead == "" {
|
||||
continue // No hooked work - no GUPP violation possible
|
||||
}
|
||||
|
||||
// Check if agent is actively working
|
||||
if fields.AgentState == "working" || fields.AgentState == "running" {
|
||||
// Check when the agent bead was last updated
|
||||
updatedAt, err := time.Parse(time.RFC3339, agent.UpdatedAt)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
age := time.Since(updatedAt)
|
||||
if age > GUPPViolationTimeout {
|
||||
d.logger.Printf("GUPP violation: agent %s has hook_bead=%s but hasn't updated in %v (timeout: %v)",
|
||||
agent.ID, fields.HookBead, age.Round(time.Minute), GUPPViolationTimeout)
|
||||
|
||||
// Notify the witness for this rig
|
||||
d.notifyWitnessOfGUPP(rigName, agent.ID, fields.HookBead, age)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// notifyWitnessOfGUPP sends a mail to the rig's witness about a GUPP violation.
|
||||
func (d *Daemon) notifyWitnessOfGUPP(rigName, agentID, hookBead string, stuckDuration time.Duration) {
|
||||
witnessAddr := rigName + "/witness"
|
||||
subject := fmt.Sprintf("GUPP_VIOLATION: %s stuck for %v", agentID, stuckDuration.Round(time.Minute))
|
||||
body := fmt.Sprintf(`Agent %s has work on hook but isn't progressing.
|
||||
|
||||
hook_bead: %s
|
||||
stuck_duration: %v
|
||||
|
||||
Action needed: Check if agent is alive and responsive. Consider restarting if stuck.`,
|
||||
agentID, hookBead, stuckDuration.Round(time.Minute))
|
||||
|
||||
cmd := exec.Command("gt", "mail", "send", witnessAddr, "-s", subject, "-m", body)
|
||||
cmd.Dir = d.config.TownRoot
|
||||
|
||||
if err := cmd.Run(); err != nil {
|
||||
d.logger.Printf("Warning: failed to notify witness of GUPP violation: %v", err)
|
||||
} else {
|
||||
d.logger.Printf("Notified %s of GUPP violation for %s", witnessAddr, agentID)
|
||||
}
|
||||
}
|
||||
|
||||
// checkOrphanedWork looks for work assigned to dead agents.
|
||||
// Orphaned work needs to be reassigned or the agent needs to be restarted.
|
||||
func (d *Daemon) checkOrphanedWork() {
|
||||
// Get list of dead agents
|
||||
deadAgents := d.getDeadAgents()
|
||||
if len(deadAgents) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// For each dead agent, check if they have hooked work
|
||||
for _, agent := range deadAgents {
|
||||
fields := beads.ParseAgentFieldsFromDescription(agent.Description)
|
||||
if fields == nil || fields.HookBead == "" {
|
||||
continue // No hooked work to orphan
|
||||
}
|
||||
|
||||
d.logger.Printf("Orphaned work detected: agent %s is dead but has hook_bead=%s",
|
||||
agent.ID, fields.HookBead)
|
||||
|
||||
// Determine the rig from the agent ID (gt-polecat-<rig>-<name>)
|
||||
rigName := d.extractRigFromAgentID(agent.ID)
|
||||
if rigName != "" {
|
||||
d.notifyWitnessOfOrphanedWork(rigName, agent.ID, fields.HookBead)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// getDeadAgents returns all agent beads with state=dead.
|
||||
func (d *Daemon) getDeadAgents() []struct {
|
||||
ID string
|
||||
Description string
|
||||
} {
|
||||
cmd := exec.Command("bd", "list", "--type=agent", "--json")
|
||||
cmd.Dir = d.config.TownRoot
|
||||
|
||||
output, err := cmd.Output()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var agents []struct {
|
||||
ID string `json:"id"`
|
||||
Type string `json:"issue_type"`
|
||||
Description string `json:"description"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(output, &agents); err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var dead []struct {
|
||||
ID string
|
||||
Description string
|
||||
}
|
||||
|
||||
for _, agent := range agents {
|
||||
fields := beads.ParseAgentFieldsFromDescription(agent.Description)
|
||||
if fields != nil && fields.AgentState == "dead" {
|
||||
dead = append(dead, struct {
|
||||
ID string
|
||||
Description string
|
||||
}{agent.ID, agent.Description})
|
||||
}
|
||||
}
|
||||
|
||||
return dead
|
||||
}
|
||||
|
||||
// extractRigFromAgentID extracts the rig name from a polecat agent ID.
|
||||
// Example: gt-polecat-gastown-max → gastown
|
||||
func (d *Daemon) extractRigFromAgentID(agentID string) string {
|
||||
// Pattern: gt-polecat-<rig>-<name>
|
||||
if !strings.HasPrefix(agentID, "gt-polecat-") {
|
||||
return ""
|
||||
}
|
||||
|
||||
rest := strings.TrimPrefix(agentID, "gt-polecat-")
|
||||
// Find the rig name (everything before the last dash)
|
||||
lastDash := strings.LastIndex(rest, "-")
|
||||
if lastDash == -1 {
|
||||
return ""
|
||||
}
|
||||
|
||||
return rest[:lastDash]
|
||||
}
|
||||
|
||||
// notifyWitnessOfOrphanedWork sends a mail to the rig's witness about orphaned work.
|
||||
func (d *Daemon) notifyWitnessOfOrphanedWork(rigName, agentID, hookBead string) {
|
||||
witnessAddr := rigName + "/witness"
|
||||
subject := fmt.Sprintf("ORPHANED_WORK: %s has hooked work but is dead", agentID)
|
||||
body := fmt.Sprintf(`Agent %s is dead but has work on its hook.
|
||||
|
||||
hook_bead: %s
|
||||
|
||||
Action needed: Either restart the agent or reassign the work.`,
|
||||
agentID, hookBead)
|
||||
|
||||
cmd := exec.Command("gt", "mail", "send", witnessAddr, "-s", subject, "-m", body)
|
||||
cmd.Dir = d.config.TownRoot
|
||||
|
||||
if err := cmd.Run(); err != nil {
|
||||
d.logger.Printf("Warning: failed to notify witness of orphaned work: %v", err)
|
||||
} else {
|
||||
d.logger.Printf("Notified %s of orphaned work for %s", witnessAddr, agentID)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user