fix: use tmux for agent liveness in daemon checks (gt-zecmc)

Complete the "discover, don't track" refactoring:

- checkRigGUPPViolations: use tmux.IsClaudeRunning() instead of agent_state
- checkOrphanedWork: derive dead agents from tmux, not agent_state=dead
- assessStaleness: rely on HasActiveSession (tmux), not agent_state

Non-observable states (stuck, awaiting-gate) cannot be derived from tmux, so
assessStaleness still reads them from agent_state and respects them.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
gastown/crew/joe
2026-01-06 20:49:49 -08:00
committed by Steve Yegge
parent 9729e05f86
commit fc4b9de02c
2 changed files with 57 additions and 61 deletions

View File

@@ -759,9 +759,14 @@ func (d *Daemon) checkRigGUPPViolations(rigName string) {
continue // No hooked work - no GUPP violation possible
}
// Check if agent is actively working
if agent.AgentState == "working" || agent.AgentState == "running" {
// Check when the agent bead was last updated
// Per gt-zecmc: derive running state from tmux, not agent_state
// Extract polecat name from agent ID (gt-polecat-<rig>-<name> -> <name>)
polecatName := strings.TrimPrefix(agent.ID, prefix)
sessionName := fmt.Sprintf("gt-%s-%s", rigName, polecatName)
// Check if tmux session exists and Claude is running
if d.tmux.IsClaudeRunning(sessionName) {
// Session is alive - check if it's been stuck too long
updatedAt, err := time.Parse(time.RFC3339, agent.UpdatedAt)
if err != nil {
continue
@@ -803,69 +808,61 @@ Action needed: Check if agent is alive and responsive. Consider restarting if st
// checkOrphanedWork looks for work assigned to dead agents.
// Orphaned work needs to be reassigned or the agent needs to be restarted.
// Per gt-zecmc: derive agent liveness from tmux, not agent_state.
func (d *Daemon) checkOrphanedWork() {
// Get list of dead agents
deadAgents := d.getDeadAgents()
if len(deadAgents) == 0 {
return
}
// For each dead agent, check if they have hooked work
// Use HookBead from database column directly (not parsed from description)
for _, agent := range deadAgents {
if agent.HookBead == "" {
continue // No hooked work to orphan
}
d.logger.Printf("Orphaned work detected: agent %s is dead but has hook_bead=%s",
agent.ID, agent.HookBead)
// Determine the rig from the agent ID (gt-polecat-<rig>-<name>)
rigName := d.extractRigFromAgentID(agent.ID)
if rigName != "" {
d.notifyWitnessOfOrphanedWork(rigName, agent.ID, agent.HookBead)
}
// Check all polecat agents with hooked work
rigs := d.getKnownRigs()
for _, rigName := range rigs {
d.checkRigOrphanedWork(rigName)
}
}
// deadAgentInfo holds info about a dead agent for orphaned work detection.
type deadAgentInfo struct {
ID string
HookBead string // Read from database column, not description
}
// getDeadAgents returns all agent beads with state=dead.
func (d *Daemon) getDeadAgents() []deadAgentInfo {
// checkRigOrphanedWork checks polecats in a specific rig for orphaned work.
func (d *Daemon) checkRigOrphanedWork(rigName string) {
cmd := exec.Command("bd", "list", "--type=agent", "--json")
cmd.Dir = d.config.TownRoot
output, err := cmd.Output()
if err != nil {
return nil
return
}
var agents []struct {
ID string `json:"id"`
Type string `json:"issue_type"`
HookBead string `json:"hook_bead"` // Read from database column
AgentState string `json:"agent_state"` // Read from database column
ID string `json:"id"`
HookBead string `json:"hook_bead"`
}
if err := json.Unmarshal(output, &agents); err != nil {
return nil
return
}
var dead []deadAgentInfo
prefix := "gt-polecat-" + rigName + "-"
for _, agent := range agents {
if agent.AgentState == "dead" {
dead = append(dead, deadAgentInfo{
ID: agent.ID,
HookBead: agent.HookBead,
})
// Only check polecats for this rig
if !strings.HasPrefix(agent.ID, prefix) {
continue
}
}
return dead
// No hooked work = nothing to orphan
if agent.HookBead == "" {
continue
}
// Check if tmux session is alive (derive state from tmux, not bead)
polecatName := strings.TrimPrefix(agent.ID, prefix)
sessionName := fmt.Sprintf("gt-%s-%s", rigName, polecatName)
// Session running = not orphaned (work is being processed)
if d.tmux.IsClaudeRunning(sessionName) {
continue
}
// Session dead but has hooked work = orphaned!
d.logger.Printf("Orphaned work detected: agent %s session is dead but has hook_bead=%s",
agent.ID, agent.HookBead)
d.notifyWitnessOfOrphanedWork(rigName, agent.ID, agent.HookBead)
}
}
// extractRigFromAgentID extracts the rig name from a polecat agent ID.

View File

@@ -866,40 +866,39 @@ func countCommitsBehind(g *git.Git, defaultBranch string) int {
}
// assessStaleness determines if a polecat should be cleaned up.
// Per gt-zecmc: uses tmux state (HasActiveSession) rather than agent_state
// since observable states (running, done, idle) are no longer recorded in beads.
func assessStaleness(info *StalenessInfo, threshold int) (bool, string) {
// Never clean up if there's uncommitted work
if info.HasUncommittedWork {
return false, "has uncommitted work"
}
// If session is active, not stale
// If session is active, not stale (tmux is source of truth for liveness)
if info.HasActiveSession {
return false, "session active"
}
// No active session - check other indicators
// No active session - this polecat is a cleanup candidate
// Check for reasons to keep it:
// If agent reports "running" state but no session, that's suspicious
// but give benefit of doubt (session may have just died)
if info.AgentState == "running" {
return false, "agent reports running (session may be restarting)"
// Check for non-observable states that indicate intentional pause
// (stuck, awaiting-gate are still stored in beads per gt-zecmc)
if info.AgentState == "stuck" || info.AgentState == "awaiting-gate" {
return false, fmt.Sprintf("agent_state=%s (intentional pause)", info.AgentState)
}
// If agent reports "done" or "idle", it's a cleanup candidate
if info.AgentState == "done" || info.AgentState == "idle" {
return true, fmt.Sprintf("agent_state=%s, no active session", info.AgentState)
}
// Way behind main is a strong staleness signal
// No session and way behind main = stale
if info.CommitsBehind >= threshold {
return true, fmt.Sprintf("%d commits behind main, no active session", info.CommitsBehind)
}
// No agent bead and no session - likely abandoned
// No session and no agent bead = abandoned, clean up
if info.AgentState == "" {
return true, "no agent bead, no active session"
}
// Default: not enough evidence to consider stale
return false, "insufficient staleness indicators"
// No session but has agent bead without special state = clean up
// (The session is the source of truth for liveness)
return true, "no active session"
}