fix: use tmux for agent liveness in daemon checks (gt-zecmc)
Complete the "discover, don't track" refactoring: - checkGUPPViolations: use tmux.IsClaudeRunning() instead of agent_state - checkOrphanedWork: derive dead agents from tmux, not agent_state=dead - assessStaleness: rely on HasActiveSession (tmux), not agent_state Non-observable states (stuck, awaiting-gate) are still respected. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
committed by
Steve Yegge
parent
9729e05f86
commit
fc4b9de02c
@@ -759,9 +759,14 @@ func (d *Daemon) checkRigGUPPViolations(rigName string) {
|
||||
continue // No hooked work - no GUPP violation possible
|
||||
}
|
||||
|
||||
// Check if agent is actively working
|
||||
if agent.AgentState == "working" || agent.AgentState == "running" {
|
||||
// Check when the agent bead was last updated
|
||||
// Per gt-zecmc: derive running state from tmux, not agent_state
|
||||
// Extract polecat name from agent ID (gt-polecat-<rig>-<name> -> <name>)
|
||||
polecatName := strings.TrimPrefix(agent.ID, prefix)
|
||||
sessionName := fmt.Sprintf("gt-%s-%s", rigName, polecatName)
|
||||
|
||||
// Check if tmux session exists and Claude is running
|
||||
if d.tmux.IsClaudeRunning(sessionName) {
|
||||
// Session is alive - check if it's been stuck too long
|
||||
updatedAt, err := time.Parse(time.RFC3339, agent.UpdatedAt)
|
||||
if err != nil {
|
||||
continue
|
||||
@@ -803,69 +808,61 @@ Action needed: Check if agent is alive and responsive. Consider restarting if st
|
||||
|
||||
// checkOrphanedWork looks for work assigned to dead agents.
|
||||
// Orphaned work needs to be reassigned or the agent needs to be restarted.
|
||||
// Per gt-zecmc: derive agent liveness from tmux, not agent_state.
|
||||
func (d *Daemon) checkOrphanedWork() {
|
||||
// Get list of dead agents
|
||||
deadAgents := d.getDeadAgents()
|
||||
if len(deadAgents) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// For each dead agent, check if they have hooked work
|
||||
// Use HookBead from database column directly (not parsed from description)
|
||||
for _, agent := range deadAgents {
|
||||
if agent.HookBead == "" {
|
||||
continue // No hooked work to orphan
|
||||
}
|
||||
|
||||
d.logger.Printf("Orphaned work detected: agent %s is dead but has hook_bead=%s",
|
||||
agent.ID, agent.HookBead)
|
||||
|
||||
// Determine the rig from the agent ID (gt-polecat-<rig>-<name>)
|
||||
rigName := d.extractRigFromAgentID(agent.ID)
|
||||
if rigName != "" {
|
||||
d.notifyWitnessOfOrphanedWork(rigName, agent.ID, agent.HookBead)
|
||||
}
|
||||
// Check all polecat agents with hooked work
|
||||
rigs := d.getKnownRigs()
|
||||
for _, rigName := range rigs {
|
||||
d.checkRigOrphanedWork(rigName)
|
||||
}
|
||||
}
|
||||
|
||||
// deadAgentInfo holds info about a dead agent for orphaned work detection.
|
||||
type deadAgentInfo struct {
|
||||
ID string
|
||||
HookBead string // Read from database column, not description
|
||||
}
|
||||
|
||||
// getDeadAgents returns all agent beads with state=dead.
|
||||
func (d *Daemon) getDeadAgents() []deadAgentInfo {
|
||||
// checkRigOrphanedWork checks polecats in a specific rig for orphaned work.
|
||||
func (d *Daemon) checkRigOrphanedWork(rigName string) {
|
||||
cmd := exec.Command("bd", "list", "--type=agent", "--json")
|
||||
cmd.Dir = d.config.TownRoot
|
||||
|
||||
output, err := cmd.Output()
|
||||
if err != nil {
|
||||
return nil
|
||||
return
|
||||
}
|
||||
|
||||
var agents []struct {
|
||||
ID string `json:"id"`
|
||||
Type string `json:"issue_type"`
|
||||
HookBead string `json:"hook_bead"` // Read from database column
|
||||
AgentState string `json:"agent_state"` // Read from database column
|
||||
ID string `json:"id"`
|
||||
HookBead string `json:"hook_bead"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(output, &agents); err != nil {
|
||||
return nil
|
||||
return
|
||||
}
|
||||
|
||||
var dead []deadAgentInfo
|
||||
prefix := "gt-polecat-" + rigName + "-"
|
||||
for _, agent := range agents {
|
||||
if agent.AgentState == "dead" {
|
||||
dead = append(dead, deadAgentInfo{
|
||||
ID: agent.ID,
|
||||
HookBead: agent.HookBead,
|
||||
})
|
||||
// Only check polecats for this rig
|
||||
if !strings.HasPrefix(agent.ID, prefix) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
return dead
|
||||
// No hooked work = nothing to orphan
|
||||
if agent.HookBead == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if tmux session is alive (derive state from tmux, not bead)
|
||||
polecatName := strings.TrimPrefix(agent.ID, prefix)
|
||||
sessionName := fmt.Sprintf("gt-%s-%s", rigName, polecatName)
|
||||
|
||||
// Session running = not orphaned (work is being processed)
|
||||
if d.tmux.IsClaudeRunning(sessionName) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Session dead but has hooked work = orphaned!
|
||||
d.logger.Printf("Orphaned work detected: agent %s session is dead but has hook_bead=%s",
|
||||
agent.ID, agent.HookBead)
|
||||
|
||||
d.notifyWitnessOfOrphanedWork(rigName, agent.ID, agent.HookBead)
|
||||
}
|
||||
}
|
||||
|
||||
// extractRigFromAgentID extracts the rig name from a polecat agent ID.
|
||||
|
||||
@@ -866,40 +866,39 @@ func countCommitsBehind(g *git.Git, defaultBranch string) int {
|
||||
}
|
||||
|
||||
// assessStaleness determines if a polecat should be cleaned up.
|
||||
// Per gt-zecmc: uses tmux state (HasActiveSession) rather than agent_state
|
||||
// since observable states (running, done, idle) are no longer recorded in beads.
|
||||
func assessStaleness(info *StalenessInfo, threshold int) (bool, string) {
|
||||
// Never clean up if there's uncommitted work
|
||||
if info.HasUncommittedWork {
|
||||
return false, "has uncommitted work"
|
||||
}
|
||||
|
||||
// If session is active, not stale
|
||||
// If session is active, not stale (tmux is source of truth for liveness)
|
||||
if info.HasActiveSession {
|
||||
return false, "session active"
|
||||
}
|
||||
|
||||
// No active session - check other indicators
|
||||
// No active session - this polecat is a cleanup candidate
|
||||
// Check for reasons to keep it:
|
||||
|
||||
// If agent reports "running" state but no session, that's suspicious
|
||||
// but give benefit of doubt (session may have just died)
|
||||
if info.AgentState == "running" {
|
||||
return false, "agent reports running (session may be restarting)"
|
||||
// Check for non-observable states that indicate intentional pause
|
||||
// (stuck, awaiting-gate are still stored in beads per gt-zecmc)
|
||||
if info.AgentState == "stuck" || info.AgentState == "awaiting-gate" {
|
||||
return false, fmt.Sprintf("agent_state=%s (intentional pause)", info.AgentState)
|
||||
}
|
||||
|
||||
// If agent reports "done" or "idle", it's a cleanup candidate
|
||||
if info.AgentState == "done" || info.AgentState == "idle" {
|
||||
return true, fmt.Sprintf("agent_state=%s, no active session", info.AgentState)
|
||||
}
|
||||
|
||||
// Way behind main is a strong staleness signal
|
||||
// No session and way behind main = stale
|
||||
if info.CommitsBehind >= threshold {
|
||||
return true, fmt.Sprintf("%d commits behind main, no active session", info.CommitsBehind)
|
||||
}
|
||||
|
||||
// No agent bead and no session - likely abandoned
|
||||
// No session and no agent bead = abandoned, clean up
|
||||
if info.AgentState == "" {
|
||||
return true, "no agent bead, no active session"
|
||||
}
|
||||
|
||||
// Default: not enough evidence to consider stale
|
||||
return false, "insufficient staleness indicators"
|
||||
// No session but has agent bead without special state = clean up
|
||||
// (The session is the source of truth for liveness)
|
||||
return true, "no active session"
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user