fix: use tmux for agent liveness in daemon checks (gt-zecmc)

Complete the "discover, don't track" refactoring, in which agent liveness is derived from tmux instead of being read from the recorded agent_state:

- checkGUPPViolations: use tmux.IsClaudeRunning() instead of agent_state
- checkOrphanedWork: derive dead agents from tmux, not agent_state=dead
- assessStaleness: rely on HasActiveSession (tmux), not agent_state

Non-observable states (stuck, awaiting-gate) cannot be derived from tmux, so they are still read from the agent bead and respected.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
gastown/crew/joe
2026-01-06 20:49:49 -08:00
committed by Steve Yegge
parent 9729e05f86
commit fc4b9de02c
2 changed files with 57 additions and 61 deletions

View File

@@ -759,9 +759,14 @@ func (d *Daemon) checkRigGUPPViolations(rigName string) {
continue // No hooked work - no GUPP violation possible continue // No hooked work - no GUPP violation possible
} }
// Check if agent is actively working // Per gt-zecmc: derive running state from tmux, not agent_state
if agent.AgentState == "working" || agent.AgentState == "running" { // Extract polecat name from agent ID (gt-polecat-<rig>-<name> -> <name>)
// Check when the agent bead was last updated polecatName := strings.TrimPrefix(agent.ID, prefix)
sessionName := fmt.Sprintf("gt-%s-%s", rigName, polecatName)
// Check if tmux session exists and Claude is running
if d.tmux.IsClaudeRunning(sessionName) {
// Session is alive - check if it's been stuck too long
updatedAt, err := time.Parse(time.RFC3339, agent.UpdatedAt) updatedAt, err := time.Parse(time.RFC3339, agent.UpdatedAt)
if err != nil { if err != nil {
continue continue
@@ -803,69 +808,61 @@ Action needed: Check if agent is alive and responsive. Consider restarting if st
// checkOrphanedWork looks for work assigned to dead agents. // checkOrphanedWork looks for work assigned to dead agents.
// Orphaned work needs to be reassigned or the agent needs to be restarted. // Orphaned work needs to be reassigned or the agent needs to be restarted.
// Per gt-zecmc: derive agent liveness from tmux, not agent_state.
func (d *Daemon) checkOrphanedWork() { func (d *Daemon) checkOrphanedWork() {
// Get list of dead agents // Check all polecat agents with hooked work
deadAgents := d.getDeadAgents() rigs := d.getKnownRigs()
if len(deadAgents) == 0 { for _, rigName := range rigs {
return d.checkRigOrphanedWork(rigName)
}
// For each dead agent, check if they have hooked work
// Use HookBead from database column directly (not parsed from description)
for _, agent := range deadAgents {
if agent.HookBead == "" {
continue // No hooked work to orphan
}
d.logger.Printf("Orphaned work detected: agent %s is dead but has hook_bead=%s",
agent.ID, agent.HookBead)
// Determine the rig from the agent ID (gt-polecat-<rig>-<name>)
rigName := d.extractRigFromAgentID(agent.ID)
if rigName != "" {
d.notifyWitnessOfOrphanedWork(rigName, agent.ID, agent.HookBead)
}
} }
} }
// deadAgentInfo holds info about a dead agent for orphaned work detection. // checkRigOrphanedWork checks polecats in a specific rig for orphaned work.
type deadAgentInfo struct { func (d *Daemon) checkRigOrphanedWork(rigName string) {
ID string
HookBead string // Read from database column, not description
}
// getDeadAgents returns all agent beads with state=dead.
func (d *Daemon) getDeadAgents() []deadAgentInfo {
cmd := exec.Command("bd", "list", "--type=agent", "--json") cmd := exec.Command("bd", "list", "--type=agent", "--json")
cmd.Dir = d.config.TownRoot cmd.Dir = d.config.TownRoot
output, err := cmd.Output() output, err := cmd.Output()
if err != nil { if err != nil {
return nil return
} }
var agents []struct { var agents []struct {
ID string `json:"id"` ID string `json:"id"`
Type string `json:"issue_type"` HookBead string `json:"hook_bead"`
HookBead string `json:"hook_bead"` // Read from database column
AgentState string `json:"agent_state"` // Read from database column
} }
if err := json.Unmarshal(output, &agents); err != nil { if err := json.Unmarshal(output, &agents); err != nil {
return nil return
} }
var dead []deadAgentInfo prefix := "gt-polecat-" + rigName + "-"
for _, agent := range agents { for _, agent := range agents {
if agent.AgentState == "dead" { // Only check polecats for this rig
dead = append(dead, deadAgentInfo{ if !strings.HasPrefix(agent.ID, prefix) {
ID: agent.ID, continue
HookBead: agent.HookBead,
})
}
} }
return dead // No hooked work = nothing to orphan
if agent.HookBead == "" {
continue
}
// Check if tmux session is alive (derive state from tmux, not bead)
polecatName := strings.TrimPrefix(agent.ID, prefix)
sessionName := fmt.Sprintf("gt-%s-%s", rigName, polecatName)
// Session running = not orphaned (work is being processed)
if d.tmux.IsClaudeRunning(sessionName) {
continue
}
// Session dead but has hooked work = orphaned!
d.logger.Printf("Orphaned work detected: agent %s session is dead but has hook_bead=%s",
agent.ID, agent.HookBead)
d.notifyWitnessOfOrphanedWork(rigName, agent.ID, agent.HookBead)
}
} }
// extractRigFromAgentID extracts the rig name from a polecat agent ID. // extractRigFromAgentID extracts the rig name from a polecat agent ID.

View File

@@ -866,40 +866,39 @@ func countCommitsBehind(g *git.Git, defaultBranch string) int {
} }
// assessStaleness determines if a polecat should be cleaned up. // assessStaleness determines if a polecat should be cleaned up.
// Per gt-zecmc: uses tmux state (HasActiveSession) rather than agent_state
// since observable states (running, done, idle) are no longer recorded in beads.
func assessStaleness(info *StalenessInfo, threshold int) (bool, string) { func assessStaleness(info *StalenessInfo, threshold int) (bool, string) {
// Never clean up if there's uncommitted work // Never clean up if there's uncommitted work
if info.HasUncommittedWork { if info.HasUncommittedWork {
return false, "has uncommitted work" return false, "has uncommitted work"
} }
// If session is active, not stale // If session is active, not stale (tmux is source of truth for liveness)
if info.HasActiveSession { if info.HasActiveSession {
return false, "session active" return false, "session active"
} }
// No active session - check other indicators // No active session - this polecat is a cleanup candidate
// Check for reasons to keep it:
// If agent reports "running" state but no session, that's suspicious // Check for non-observable states that indicate intentional pause
// but give benefit of doubt (session may have just died) // (stuck, awaiting-gate are still stored in beads per gt-zecmc)
if info.AgentState == "running" { if info.AgentState == "stuck" || info.AgentState == "awaiting-gate" {
return false, "agent reports running (session may be restarting)" return false, fmt.Sprintf("agent_state=%s (intentional pause)", info.AgentState)
} }
// If agent reports "done" or "idle", it's a cleanup candidate // No session and way behind main = stale
if info.AgentState == "done" || info.AgentState == "idle" {
return true, fmt.Sprintf("agent_state=%s, no active session", info.AgentState)
}
// Way behind main is a strong staleness signal
if info.CommitsBehind >= threshold { if info.CommitsBehind >= threshold {
return true, fmt.Sprintf("%d commits behind main, no active session", info.CommitsBehind) return true, fmt.Sprintf("%d commits behind main, no active session", info.CommitsBehind)
} }
// No agent bead and no session - likely abandoned // No session and no agent bead = abandoned, clean up
if info.AgentState == "" { if info.AgentState == "" {
return true, "no agent bead, no active session" return true, "no agent bead, no active session"
} }
// Default: not enough evidence to consider stale // No session but has agent bead without special state = clean up
return false, "insufficient staleness indicators" // (The session is the source of truth for liveness)
return true, "no active session"
} }