fix: use tmux for agent liveness in daemon checks (gt-zecmc)

Complete the "discover, don't track" refactoring:

- checkGUPPViolations: use tmux.IsClaudeRunning() instead of agent_state
- checkOrphanedWork: derive dead agents from tmux, not agent_state=dead
- assessStaleness: rely on HasActiveSession (tmux), not agent_state

Non-observable states (stuck, awaiting-gate) are still respected.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-06 20:49:49 -08:00
parent 9729e05f86
commit fc4b9de02c
2 changed files with 57 additions and 61 deletions
--- a/internal/daemon/lifecycle.go
+++ b/internal/daemon/lifecycle.go
@@ -759,9 +759,14 @@ func (d *Daemon) checkRigGUPPViolations(rigName string) {
 			continue // No hooked work - no GUPP violation possible
 		}
-		// Check if agent is actively working
+		// Per gt-zecmc: derive running state from tmux, not agent_state
-		if agent.AgentState == "working" || agent.AgentState == "running" {
+		// Extract polecat name from agent ID (gt-polecat-<rig>-<name> -> <name>)
-			// Check when the agent bead was last updated
+		polecatName := strings.TrimPrefix(agent.ID, prefix)
 		sessionName := fmt.Sprintf("gt-%s-%s", rigName, polecatName)
 		// Check if tmux session exists and Claude is running
 		if d.tmux.IsClaudeRunning(sessionName) {
 			// Session is alive - check if it's been stuck too long
 			updatedAt, err := time.Parse(time.RFC3339, agent.UpdatedAt)
 			if err != nil {
 				continue
@@ -803,69 +808,61 @@ Action needed: Check if agent is alive and responsive. Consider restarting if st
 // checkOrphanedWork looks for work assigned to dead agents.
 // Orphaned work needs to be reassigned or the agent needs to be restarted.
 // Per gt-zecmc: derive agent liveness from tmux, not agent_state.
 func (d *Daemon) checkOrphanedWork() {
-	// Get list of dead agents
+	// Check all polecat agents with hooked work
-	deadAgents := d.getDeadAgents()
+	rigs := d.getKnownRigs()
-	if len(deadAgents) == 0 {
+	for _, rigName := range rigs {
-		return
+		d.checkRigOrphanedWork(rigName)
 	}
 	// For each dead agent, check if they have hooked work
 	// Use HookBead from database column directly (not parsed from description)
 	for _, agent := range deadAgents {
 		if agent.HookBead == "" {
 			continue // No hooked work to orphan
 		}
 		d.logger.Printf("Orphaned work detected: agent %s is dead but has hook_bead=%s",
 			agent.ID, agent.HookBead)
 		// Determine the rig from the agent ID (gt-polecat-<rig>-<name>)
 		rigName := d.extractRigFromAgentID(agent.ID)
 		if rigName != "" {
 			d.notifyWitnessOfOrphanedWork(rigName, agent.ID, agent.HookBead)
 		}
 	}
 }
-// deadAgentInfo holds info about a dead agent for orphaned work detection.
+// checkRigOrphanedWork checks polecats in a specific rig for orphaned work.
-type deadAgentInfo struct {
+func (d *Daemon) checkRigOrphanedWork(rigName string) {
 	ID       string
 	HookBead string // Read from database column, not description
 }
 // getDeadAgents returns all agent beads with state=dead.
 func (d *Daemon) getDeadAgents() []deadAgentInfo {
 	cmd := exec.Command("bd", "list", "--type=agent", "--json")
 	cmd.Dir = d.config.TownRoot
 	output, err := cmd.Output()
 	if err != nil {
-		return nil
+		return
 	}
 	var agents []struct {
 		ID       string `json:"id"`
-		Type       string `json:"issue_type"`
+		HookBead string `json:"hook_bead"`
 		HookBead   string `json:"hook_bead"`   // Read from database column
 		AgentState string `json:"agent_state"` // Read from database column
 	}
 	if err := json.Unmarshal(output, &agents); err != nil {
-		return nil
+		return
 	}
-	var dead []deadAgentInfo
+	prefix := "gt-polecat-" + rigName + "-"
 	for _, agent := range agents {
-		if agent.AgentState == "dead" {
+		// Only check polecats for this rig
-			dead = append(dead, deadAgentInfo{
+		if !strings.HasPrefix(agent.ID, prefix) {
-				ID:       agent.ID,
+			continue
 				HookBead: agent.HookBead,
 			})
 		}
 		}
-	return dead
+		// No hooked work = nothing to orphan
 		if agent.HookBead == "" {
 			continue
 		}
 		// Check if tmux session is alive (derive state from tmux, not bead)
 		polecatName := strings.TrimPrefix(agent.ID, prefix)
 		sessionName := fmt.Sprintf("gt-%s-%s", rigName, polecatName)
 		// Session running = not orphaned (work is being processed)
 		if d.tmux.IsClaudeRunning(sessionName) {
 			continue
 		}
 		// Session dead but has hooked work = orphaned!
 		d.logger.Printf("Orphaned work detected: agent %s session is dead but has hook_bead=%s",
 			agent.ID, agent.HookBead)
 		d.notifyWitnessOfOrphanedWork(rigName, agent.ID, agent.HookBead)
 	}
 }
 // extractRigFromAgentID extracts the rig name from a polecat agent ID.
--- a/internal/polecat/manager.go
+++ b/internal/polecat/manager.go
@@ -866,40 +866,39 @@ func countCommitsBehind(g *git.Git, defaultBranch string) int {
 }
 // assessStaleness determines if a polecat should be cleaned up.
 // Per gt-zecmc: uses tmux state (HasActiveSession) rather than agent_state
 // since observable states (running, done, idle) are no longer recorded in beads.
 func assessStaleness(info *StalenessInfo, threshold int) (bool, string) {
 	// Never clean up if there's uncommitted work
 	if info.HasUncommittedWork {
 		return false, "has uncommitted work"
 	}
-	// If session is active, not stale
+	// If session is active, not stale (tmux is source of truth for liveness)
 	if info.HasActiveSession {
 		return false, "session active"
 	}
-	// No active session - check other indicators
+	// No active session - this polecat is a cleanup candidate
 	// Check for reasons to keep it:
-	// If agent reports "running" state but no session, that's suspicious
+	// Check for non-observable states that indicate intentional pause
-	// but give benefit of doubt (session may have just died)
+	// (stuck, awaiting-gate are still stored in beads per gt-zecmc)
-	if info.AgentState == "running" {
+	if info.AgentState == "stuck" || info.AgentState == "awaiting-gate" {
-		return false, "agent reports running (session may be restarting)"
+		return false, fmt.Sprintf("agent_state=%s (intentional pause)", info.AgentState)
 	}
-	// If agent reports "done" or "idle", it's a cleanup candidate
+	// No session and way behind main = stale
 	if info.AgentState == "done" || info.AgentState == "idle" {
 		return true, fmt.Sprintf("agent_state=%s, no active session", info.AgentState)
 	}
 	// Way behind main is a strong staleness signal
 	if info.CommitsBehind >= threshold {
 		return true, fmt.Sprintf("%d commits behind main, no active session", info.CommitsBehind)
 	}
-	// No agent bead and no session - likely abandoned
+	// No session and no agent bead = abandoned, clean up
 	if info.AgentState == "" {
 		return true, "no agent bead, no active session"
 	}
-	// Default: not enough evidence to consider stale
+	// No session but has agent bead without special state = clean up
-	return false, "insufficient staleness indicators"
+	// (The session is the source of truth for liveness)
 	return true, "no active session"
 }