fix: use tmux for agent liveness in daemon checks (gt-zecmc)
Complete the "discover, don't track" refactoring:

- checkGUPPViolations: use tmux.IsClaudeRunning() instead of agent_state
- checkOrphanedWork: derive dead agents from tmux, not agent_state=dead
- assessStaleness: rely on HasActiveSession (tmux), not agent_state

Non-observable states (stuck, awaiting-gate) are still respected.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
committed by Steve Yegge
parent 9729e05f86
commit fc4b9de02c
@@ -759,9 +759,14 @@ func (d *Daemon) checkRigGUPPViolations(rigName string) {
|
|||||||
continue // No hooked work - no GUPP violation possible
|
continue // No hooked work - no GUPP violation possible
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if agent is actively working
|
// Per gt-zecmc: derive running state from tmux, not agent_state
|
||||||
if agent.AgentState == "working" || agent.AgentState == "running" {
|
// Extract polecat name from agent ID (gt-polecat-<rig>-<name> -> <name>)
|
||||||
// Check when the agent bead was last updated
|
polecatName := strings.TrimPrefix(agent.ID, prefix)
|
||||||
|
sessionName := fmt.Sprintf("gt-%s-%s", rigName, polecatName)
|
||||||
|
|
||||||
|
// Check if tmux session exists and Claude is running
|
||||||
|
if d.tmux.IsClaudeRunning(sessionName) {
|
||||||
|
// Session is alive - check if it's been stuck too long
|
||||||
updatedAt, err := time.Parse(time.RFC3339, agent.UpdatedAt)
|
updatedAt, err := time.Parse(time.RFC3339, agent.UpdatedAt)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
@@ -803,69 +808,61 @@ Action needed: Check if agent is alive and responsive. Consider restarting if st
|
|||||||
|
|
||||||
// checkOrphanedWork looks for work assigned to dead agents.
|
// checkOrphanedWork looks for work assigned to dead agents.
|
||||||
// Orphaned work needs to be reassigned or the agent needs to be restarted.
|
// Orphaned work needs to be reassigned or the agent needs to be restarted.
|
||||||
|
// Per gt-zecmc: derive agent liveness from tmux, not agent_state.
|
||||||
func (d *Daemon) checkOrphanedWork() {
|
func (d *Daemon) checkOrphanedWork() {
|
||||||
// Get list of dead agents
|
// Check all polecat agents with hooked work
|
||||||
deadAgents := d.getDeadAgents()
|
rigs := d.getKnownRigs()
|
||||||
if len(deadAgents) == 0 {
|
for _, rigName := range rigs {
|
||||||
return
|
d.checkRigOrphanedWork(rigName)
|
||||||
}
|
|
||||||
|
|
||||||
// For each dead agent, check if they have hooked work
|
|
||||||
// Use HookBead from database column directly (not parsed from description)
|
|
||||||
for _, agent := range deadAgents {
|
|
||||||
if agent.HookBead == "" {
|
|
||||||
continue // No hooked work to orphan
|
|
||||||
}
|
|
||||||
|
|
||||||
d.logger.Printf("Orphaned work detected: agent %s is dead but has hook_bead=%s",
|
|
||||||
agent.ID, agent.HookBead)
|
|
||||||
|
|
||||||
// Determine the rig from the agent ID (gt-polecat-<rig>-<name>)
|
|
||||||
rigName := d.extractRigFromAgentID(agent.ID)
|
|
||||||
if rigName != "" {
|
|
||||||
d.notifyWitnessOfOrphanedWork(rigName, agent.ID, agent.HookBead)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// deadAgentInfo holds info about a dead agent for orphaned work detection.
|
// checkRigOrphanedWork checks polecats in a specific rig for orphaned work.
|
||||||
type deadAgentInfo struct {
|
func (d *Daemon) checkRigOrphanedWork(rigName string) {
|
||||||
ID string
|
|
||||||
HookBead string // Read from database column, not description
|
|
||||||
}
|
|
||||||
|
|
||||||
// getDeadAgents returns all agent beads with state=dead.
|
|
||||||
func (d *Daemon) getDeadAgents() []deadAgentInfo {
|
|
||||||
cmd := exec.Command("bd", "list", "--type=agent", "--json")
|
cmd := exec.Command("bd", "list", "--type=agent", "--json")
|
||||||
cmd.Dir = d.config.TownRoot
|
cmd.Dir = d.config.TownRoot
|
||||||
|
|
||||||
output, err := cmd.Output()
|
output, err := cmd.Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
var agents []struct {
|
var agents []struct {
|
||||||
ID string `json:"id"`
|
ID string `json:"id"`
|
||||||
Type string `json:"issue_type"`
|
HookBead string `json:"hook_bead"`
|
||||||
HookBead string `json:"hook_bead"` // Read from database column
|
|
||||||
AgentState string `json:"agent_state"` // Read from database column
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := json.Unmarshal(output, &agents); err != nil {
|
if err := json.Unmarshal(output, &agents); err != nil {
|
||||||
return nil
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
var dead []deadAgentInfo
|
prefix := "gt-polecat-" + rigName + "-"
|
||||||
for _, agent := range agents {
|
for _, agent := range agents {
|
||||||
if agent.AgentState == "dead" {
|
// Only check polecats for this rig
|
||||||
dead = append(dead, deadAgentInfo{
|
if !strings.HasPrefix(agent.ID, prefix) {
|
||||||
ID: agent.ID,
|
continue
|
||||||
HookBead: agent.HookBead,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return dead
|
// No hooked work = nothing to orphan
|
||||||
|
if agent.HookBead == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if tmux session is alive (derive state from tmux, not bead)
|
||||||
|
polecatName := strings.TrimPrefix(agent.ID, prefix)
|
||||||
|
sessionName := fmt.Sprintf("gt-%s-%s", rigName, polecatName)
|
||||||
|
|
||||||
|
// Session running = not orphaned (work is being processed)
|
||||||
|
if d.tmux.IsClaudeRunning(sessionName) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Session dead but has hooked work = orphaned!
|
||||||
|
d.logger.Printf("Orphaned work detected: agent %s session is dead but has hook_bead=%s",
|
||||||
|
agent.ID, agent.HookBead)
|
||||||
|
|
||||||
|
d.notifyWitnessOfOrphanedWork(rigName, agent.ID, agent.HookBead)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// extractRigFromAgentID extracts the rig name from a polecat agent ID.
|
// extractRigFromAgentID extracts the rig name from a polecat agent ID.
|
||||||
|
|||||||
@@ -866,40 +866,39 @@ func countCommitsBehind(g *git.Git, defaultBranch string) int {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// assessStaleness determines if a polecat should be cleaned up.
|
// assessStaleness determines if a polecat should be cleaned up.
|
||||||
|
// Per gt-zecmc: uses tmux state (HasActiveSession) rather than agent_state
|
||||||
|
// since observable states (running, done, idle) are no longer recorded in beads.
|
||||||
func assessStaleness(info *StalenessInfo, threshold int) (bool, string) {
|
func assessStaleness(info *StalenessInfo, threshold int) (bool, string) {
|
||||||
// Never clean up if there's uncommitted work
|
// Never clean up if there's uncommitted work
|
||||||
if info.HasUncommittedWork {
|
if info.HasUncommittedWork {
|
||||||
return false, "has uncommitted work"
|
return false, "has uncommitted work"
|
||||||
}
|
}
|
||||||
|
|
||||||
// If session is active, not stale
|
// If session is active, not stale (tmux is source of truth for liveness)
|
||||||
if info.HasActiveSession {
|
if info.HasActiveSession {
|
||||||
return false, "session active"
|
return false, "session active"
|
||||||
}
|
}
|
||||||
|
|
||||||
// No active session - check other indicators
|
// No active session - this polecat is a cleanup candidate
|
||||||
|
// Check for reasons to keep it:
|
||||||
|
|
||||||
// If agent reports "running" state but no session, that's suspicious
|
// Check for non-observable states that indicate intentional pause
|
||||||
// but give benefit of doubt (session may have just died)
|
// (stuck, awaiting-gate are still stored in beads per gt-zecmc)
|
||||||
if info.AgentState == "running" {
|
if info.AgentState == "stuck" || info.AgentState == "awaiting-gate" {
|
||||||
return false, "agent reports running (session may be restarting)"
|
return false, fmt.Sprintf("agent_state=%s (intentional pause)", info.AgentState)
|
||||||
}
|
}
|
||||||
|
|
||||||
// If agent reports "done" or "idle", it's a cleanup candidate
|
// No session and way behind main = stale
|
||||||
if info.AgentState == "done" || info.AgentState == "idle" {
|
|
||||||
return true, fmt.Sprintf("agent_state=%s, no active session", info.AgentState)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Way behind main is a strong staleness signal
|
|
||||||
if info.CommitsBehind >= threshold {
|
if info.CommitsBehind >= threshold {
|
||||||
return true, fmt.Sprintf("%d commits behind main, no active session", info.CommitsBehind)
|
return true, fmt.Sprintf("%d commits behind main, no active session", info.CommitsBehind)
|
||||||
}
|
}
|
||||||
|
|
||||||
// No agent bead and no session - likely abandoned
|
// No session and no agent bead = abandoned, clean up
|
||||||
if info.AgentState == "" {
|
if info.AgentState == "" {
|
||||||
return true, "no agent bead, no active session"
|
return true, "no agent bead, no active session"
|
||||||
}
|
}
|
||||||
|
|
||||||
// Default: not enough evidence to consider stale
|
// No session but has agent bead without special state = clean up
|
||||||
return false, "insufficient staleness indicators"
|
// (The session is the source of truth for liveness)
|
||||||
|
return true, "no active session"
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user