From fc4b9de02cead3b3857ac392330abff0df667927 Mon Sep 17 00:00:00 2001 From: gastown/crew/joe Date: Tue, 6 Jan 2026 20:49:49 -0800 Subject: [PATCH] fix: use tmux for agent liveness in daemon checks (gt-zecmc) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Complete the "discover, don't track" refactoring: - checkGUPPViolations: use tmux.IsClaudeRunning() instead of agent_state - checkOrphanedWork: derive dead agents from tmux, not agent_state=dead - assessStaleness: rely on HasActiveSession (tmux), not agent_state Non-observable states (stuck, awaiting-gate) are still respected. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- internal/daemon/lifecycle.go | 89 +++++++++++++++++------------------- internal/polecat/manager.go | 29 ++++++------ 2 files changed, 57 insertions(+), 61 deletions(-) diff --git a/internal/daemon/lifecycle.go b/internal/daemon/lifecycle.go index 171b07f6..6950e26d 100644 --- a/internal/daemon/lifecycle.go +++ b/internal/daemon/lifecycle.go @@ -759,9 +759,14 @@ func (d *Daemon) checkRigGUPPViolations(rigName string) { continue // No hooked work - no GUPP violation possible } - // Check if agent is actively working - if agent.AgentState == "working" || agent.AgentState == "running" { - // Check when the agent bead was last updated + // Per gt-zecmc: derive running state from tmux, not agent_state + // Extract polecat name from agent ID (gt-polecat-<rig>-<name> -> <name>) + polecatName := strings.TrimPrefix(agent.ID, prefix) + sessionName := fmt.Sprintf("gt-%s-%s", rigName, polecatName) + + // Check if tmux session exists and Claude is running + if d.tmux.IsClaudeRunning(sessionName) { + // Session is alive - check if it's been stuck too long updatedAt, err := time.Parse(time.RFC3339, agent.UpdatedAt) if err != nil { continue @@ -803,69 +808,61 @@ Action needed: Check if agent is alive and responsive. 
Consider restarting if st // checkOrphanedWork looks for work assigned to dead agents. // Orphaned work needs to be reassigned or the agent needs to be restarted. +// Per gt-zecmc: derive agent liveness from tmux, not agent_state. func (d *Daemon) checkOrphanedWork() { - // Get list of dead agents - deadAgents := d.getDeadAgents() - if len(deadAgents) == 0 { - return - } - - // For each dead agent, check if they have hooked work - // Use HookBead from database column directly (not parsed from description) - for _, agent := range deadAgents { - if agent.HookBead == "" { - continue // No hooked work to orphan - } - - d.logger.Printf("Orphaned work detected: agent %s is dead but has hook_bead=%s", - agent.ID, agent.HookBead) - - // Determine the rig from the agent ID (gt-polecat-<rig>-<name>) - rigName := d.extractRigFromAgentID(agent.ID) - if rigName != "" { - d.notifyWitnessOfOrphanedWork(rigName, agent.ID, agent.HookBead) - } + // Check all polecat agents with hooked work + rigs := d.getKnownRigs() + for _, rigName := range rigs { + d.checkRigOrphanedWork(rigName) } } -// deadAgentInfo holds info about a dead agent for orphaned work detection. -type deadAgentInfo struct { - ID string - HookBead string // Read from database column, not description -} - -// getDeadAgents returns all agent beads with state=dead. -func (d *Daemon) getDeadAgents() []deadAgentInfo { +// checkRigOrphanedWork checks polecats in a specific rig for orphaned work. 
+func (d *Daemon) checkRigOrphanedWork(rigName string) { cmd := exec.Command("bd", "list", "--type=agent", "--json") cmd.Dir = d.config.TownRoot output, err := cmd.Output() if err != nil { - return nil + return } var agents []struct { - ID string `json:"id"` - Type string `json:"issue_type"` - HookBead string `json:"hook_bead"` // Read from database column - AgentState string `json:"agent_state"` // Read from database column + ID string `json:"id"` + HookBead string `json:"hook_bead"` } if err := json.Unmarshal(output, &agents); err != nil { - return nil + return } - var dead []deadAgentInfo + prefix := "gt-polecat-" + rigName + "-" for _, agent := range agents { - if agent.AgentState == "dead" { - dead = append(dead, deadAgentInfo{ - ID: agent.ID, - HookBead: agent.HookBead, - }) + // Only check polecats for this rig + if !strings.HasPrefix(agent.ID, prefix) { + continue } - } - return dead + // No hooked work = nothing to orphan + if agent.HookBead == "" { + continue + } + + // Check if tmux session is alive (derive state from tmux, not bead) + polecatName := strings.TrimPrefix(agent.ID, prefix) + sessionName := fmt.Sprintf("gt-%s-%s", rigName, polecatName) + + // Session running = not orphaned (work is being processed) + if d.tmux.IsClaudeRunning(sessionName) { + continue + } + + // Session dead but has hooked work = orphaned! + d.logger.Printf("Orphaned work detected: agent %s session is dead but has hook_bead=%s", + agent.ID, agent.HookBead) + + d.notifyWitnessOfOrphanedWork(rigName, agent.ID, agent.HookBead) + } } // extractRigFromAgentID extracts the rig name from a polecat agent ID. diff --git a/internal/polecat/manager.go b/internal/polecat/manager.go index cb09494f..7c0b9d88 100644 --- a/internal/polecat/manager.go +++ b/internal/polecat/manager.go @@ -866,40 +866,39 @@ func countCommitsBehind(g *git.Git, defaultBranch string) int { } // assessStaleness determines if a polecat should be cleaned up. 
+// Per gt-zecmc: uses tmux state (HasActiveSession) rather than agent_state +// since observable states (running, done, idle) are no longer recorded in beads. func assessStaleness(info *StalenessInfo, threshold int) (bool, string) { // Never clean up if there's uncommitted work if info.HasUncommittedWork { return false, "has uncommitted work" } - // If session is active, not stale + // If session is active, not stale (tmux is source of truth for liveness) if info.HasActiveSession { return false, "session active" } - // No active session - check other indicators + // No active session - this polecat is a cleanup candidate + // Check for reasons to keep it: - // If agent reports "running" state but no session, that's suspicious - // but give benefit of doubt (session may have just died) - if info.AgentState == "running" { - return false, "agent reports running (session may be restarting)" + // Check for non-observable states that indicate intentional pause + // (stuck, awaiting-gate are still stored in beads per gt-zecmc) + if info.AgentState == "stuck" || info.AgentState == "awaiting-gate" { + return false, fmt.Sprintf("agent_state=%s (intentional pause)", info.AgentState) } - // If agent reports "done" or "idle", it's a cleanup candidate - if info.AgentState == "done" || info.AgentState == "idle" { - return true, fmt.Sprintf("agent_state=%s, no active session", info.AgentState) - } - - // Way behind main is a strong staleness signal + // No session and way behind main = stale if info.CommitsBehind >= threshold { return true, fmt.Sprintf("%d commits behind main, no active session", info.CommitsBehind) } - // No agent bead and no session - likely abandoned + // No session and no agent bead = abandoned, clean up if info.AgentState == "" { return true, "no agent bead, no active session" } - // Default: not enough evidence to consider stale - return false, "insufficient staleness indicators" + // No session but has agent bead without special state = clean up + // (The 
session is the source of truth for liveness) + return true, "no active session" }