feat(deacon): add stale hooked bead cleanup (gt-2yls3)
Add `gt deacon stale-hooks` command to find and unhook stale beads. Problem: Beads can get stuck in 'hooked' status when agents die or abandon work without properly unhooking. Solution: - New command scans for hooked beads older than threshold (default 1h) - Checks if assignee agent is still alive (tmux session exists) - Unhooks beads with dead agents (sets status back to 'open') - Supports --dry-run to preview without making changes Also adds "stale-hook-check" step to Deacon patrol formula. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
committed by
Steve Yegge
parent
ac63b10aa8
commit
74409dc32b
@@ -186,6 +186,22 @@ This helps the Deacon understand which agents may need attention.`,
|
||||
RunE: runDeaconHealthState,
|
||||
}
|
||||
|
||||
// deaconStaleHooksCmd defines the "stale-hooks" subcommand, which is executed
// by runDeaconStaleHooks. Its --max-age and --dry-run flags are registered in
// init.
//
// NOTE(review): the example lines inside Long appear without indentation or
// column alignment in this view; confirm the intended whitespace against the
// repository before reformatting the help text.
var deaconStaleHooksCmd = &cobra.Command{
	Use:   "stale-hooks",
	Short: "Find and unhook stale hooked beads",
	Long: `Find beads stuck in 'hooked' status and unhook them if the agent is gone.

Beads can get stuck in 'hooked' status when agents die or abandon work.
This command finds hooked beads older than the threshold (default: 1 hour),
checks if the assignee agent is still alive, and unhooks them if not.

Examples:
gt deacon stale-hooks # Find and unhook stale beads
gt deacon stale-hooks --dry-run # Preview what would be unhooked
gt deacon stale-hooks --max-age=30m # Use 30 minute threshold`,
	RunE: runDeaconStaleHooks,
}
|
||||
|
||||
|
||||
var (
|
||||
triggerTimeout time.Duration
|
||||
@@ -198,6 +214,10 @@ var (
|
||||
// Force kill flags
|
||||
forceKillReason string
|
||||
forceKillSkipNotify bool
|
||||
|
||||
// Stale hooks flags
|
||||
staleHooksMaxAge time.Duration
|
||||
staleHooksDryRun bool
|
||||
)
|
||||
|
||||
func init() {
|
||||
@@ -211,6 +231,7 @@ func init() {
|
||||
deaconCmd.AddCommand(deaconHealthCheckCmd)
|
||||
deaconCmd.AddCommand(deaconForceKillCmd)
|
||||
deaconCmd.AddCommand(deaconHealthStateCmd)
|
||||
deaconCmd.AddCommand(deaconStaleHooksCmd)
|
||||
|
||||
// Flags for trigger-pending
|
||||
deaconTriggerPendingCmd.Flags().DurationVar(&triggerTimeout, "timeout", 2*time.Second,
|
||||
@@ -230,6 +251,12 @@ func init() {
|
||||
deaconForceKillCmd.Flags().BoolVar(&forceKillSkipNotify, "skip-notify", false,
|
||||
"Skip sending notification mail to mayor")
|
||||
|
||||
// Flags for stale-hooks
|
||||
deaconStaleHooksCmd.Flags().DurationVar(&staleHooksMaxAge, "max-age", 1*time.Hour,
|
||||
"Maximum age before a hooked bead is considered stale")
|
||||
deaconStaleHooksCmd.Flags().BoolVar(&staleHooksDryRun, "dry-run", false,
|
||||
"Preview what would be unhooked without making changes")
|
||||
|
||||
rootCmd.AddCommand(deaconCmd)
|
||||
}
|
||||
|
||||
@@ -908,3 +935,68 @@ func updateAgentBeadState(townRoot, agent, state, _ string) { // reason unused b
|
||||
_ = cmd.Run() // Best effort
|
||||
}
|
||||
|
||||
// runDeaconStaleHooks finds and unhooks stale hooked beads.
|
||||
func runDeaconStaleHooks(cmd *cobra.Command, args []string) error {
|
||||
townRoot, err := workspace.FindFromCwdOrError()
|
||||
if err != nil {
|
||||
return fmt.Errorf("not in a Gas Town workspace: %w", err)
|
||||
}
|
||||
|
||||
cfg := &deacon.StaleHookConfig{
|
||||
MaxAge: staleHooksMaxAge,
|
||||
DryRun: staleHooksDryRun,
|
||||
}
|
||||
|
||||
result, err := deacon.ScanStaleHooks(townRoot, cfg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("scanning stale hooks: %w", err)
|
||||
}
|
||||
|
||||
// Print summary
|
||||
if result.TotalHooked == 0 {
|
||||
fmt.Printf("%s No hooked beads found\n", style.Dim.Render("○"))
|
||||
return nil
|
||||
}
|
||||
|
||||
fmt.Printf("%s Found %d hooked bead(s), %d stale (older than %s)\n",
|
||||
style.Bold.Render("●"), result.TotalHooked, result.StaleCount, staleHooksMaxAge)
|
||||
|
||||
if result.StaleCount == 0 {
|
||||
fmt.Printf("%s No stale hooked beads\n", style.Dim.Render("○"))
|
||||
return nil
|
||||
}
|
||||
|
||||
// Print details for each stale bead
|
||||
for _, r := range result.Results {
|
||||
status := style.Dim.Render("○")
|
||||
action := "skipped (agent alive)"
|
||||
|
||||
if !r.AgentAlive {
|
||||
if staleHooksDryRun {
|
||||
status = style.Bold.Render("?")
|
||||
action = "would unhook (agent dead)"
|
||||
} else if r.Unhooked {
|
||||
status = style.Bold.Render("✓")
|
||||
action = "unhooked (agent dead)"
|
||||
} else if r.Error != "" {
|
||||
status = style.Dim.Render("✗")
|
||||
action = fmt.Sprintf("error: %s", r.Error)
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Printf(" %s %s: %s (age: %s, assignee: %s)\n",
|
||||
status, r.BeadID, action, r.Age, r.Assignee)
|
||||
}
|
||||
|
||||
// Summary
|
||||
if staleHooksDryRun {
|
||||
fmt.Printf("\n%s Dry run - no changes made. Run without --dry-run to unhook.\n",
|
||||
style.Dim.Render("ℹ"))
|
||||
} else if result.Unhooked > 0 {
|
||||
fmt.Printf("\n%s Unhooked %d stale bead(s)\n",
|
||||
style.Bold.Render("✓"), result.Unhooked)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
194
internal/deacon/stale_hooks.go
Normal file
194
internal/deacon/stale_hooks.go
Normal file
@@ -0,0 +1,194 @@
|
||||
// Package deacon provides the Deacon agent infrastructure.
|
||||
package deacon
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/steveyegge/gastown/internal/tmux"
|
||||
)
|
||||
|
||||
// StaleHookConfig holds configurable parameters for stale hook detection.
// ScanStaleHooks substitutes DefaultStaleHookConfig when it is given a nil
// config.
type StaleHookConfig struct {
	// MaxAge is how long a bead can be hooked before being considered stale.
	// Staleness is measured from the bead's last update time.
	MaxAge time.Duration `json:"max_age"`
	// DryRun if true, only reports what would be done without making changes.
	DryRun bool `json:"dry_run"`
}
|
||||
|
||||
// DefaultStaleHookConfig returns the default stale hook config.
|
||||
func DefaultStaleHookConfig() *StaleHookConfig {
|
||||
return &StaleHookConfig{
|
||||
MaxAge: 1 * time.Hour,
|
||||
DryRun: false,
|
||||
}
|
||||
}
|
||||
|
||||
// HookedBead represents a bead in hooked status from bd list output.
// Values are decoded from the JSON printed by `bd list --status=hooked --json`.
type HookedBead struct {
	// ID is the bead identifier; it is passed to `bd update` when unhooking.
	ID string `json:"id"`
	// Title is the bead's title, copied into scan results for display.
	Title string `json:"title"`
	// Status is the bead status as reported by bd.
	Status string `json:"status"`
	// Assignee is the agent address the bead is assigned to; may be empty.
	Assignee string `json:"assignee"`
	// UpdatedAt is the bead's last update time; it is compared against the
	// MaxAge threshold to decide staleness.
	UpdatedAt time.Time `json:"updated_at"`
}
|
||||
|
||||
// StaleHookResult represents the result of processing a stale hooked bead.
type StaleHookResult struct {
	// BeadID is the ID of the stale bead.
	BeadID string `json:"bead_id"`
	// Title is the bead's title.
	Title string `json:"title"`
	// Assignee is the agent address the bead was assigned to; may be empty.
	Assignee string `json:"assignee"`
	// Age is the time since the bead was last updated, rounded to the minute
	// and formatted as a duration string.
	Age string `json:"age"`
	// AgentAlive reports whether a tmux session for the assignee was found.
	// It stays false when there is no assignee or no session name could be
	// derived from it.
	AgentAlive bool `json:"agent_alive"`
	// Unhooked is true when the bead's status was successfully reset.
	Unhooked bool `json:"unhooked"`
	// Error is a human-readable failure message recorded while processing this
	// bead; it is empty (and omitted from JSON) when nothing failed.
	Error string `json:"error,omitempty"`
}
|
||||
|
||||
// StaleHookScanResult contains the full results of a stale hook scan.
type StaleHookScanResult struct {
	// ScannedAt is the UTC time at which the scan started.
	ScannedAt time.Time `json:"scanned_at"`
	// TotalHooked is the number of hooked beads returned by bd, stale or not.
	TotalHooked int `json:"total_hooked"`
	// StaleCount is the number of hooked beads older than the MaxAge threshold.
	StaleCount int `json:"stale_count"`
	// Unhooked is the number of stale beads whose status was successfully reset.
	Unhooked int `json:"unhooked"`
	// Results has one entry per stale bead. ScanStaleHooks initializes it to an
	// empty, non-nil slice.
	Results []*StaleHookResult `json:"results"`
}
|
||||
|
||||
// ScanStaleHooks finds hooked beads older than the threshold and optionally unhooks them.
|
||||
func ScanStaleHooks(townRoot string, cfg *StaleHookConfig) (*StaleHookScanResult, error) {
|
||||
if cfg == nil {
|
||||
cfg = DefaultStaleHookConfig()
|
||||
}
|
||||
|
||||
result := &StaleHookScanResult{
|
||||
ScannedAt: time.Now().UTC(),
|
||||
Results: make([]*StaleHookResult, 0),
|
||||
}
|
||||
|
||||
// Get all hooked beads
|
||||
hookedBeads, err := listHookedBeads(townRoot)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("listing hooked beads: %w", err)
|
||||
}
|
||||
|
||||
result.TotalHooked = len(hookedBeads)
|
||||
|
||||
// Filter to stale ones (older than threshold)
|
||||
threshold := time.Now().Add(-cfg.MaxAge)
|
||||
t := tmux.NewTmux()
|
||||
|
||||
for _, bead := range hookedBeads {
|
||||
// Skip if updated recently (not stale)
|
||||
if bead.UpdatedAt.After(threshold) {
|
||||
continue
|
||||
}
|
||||
|
||||
result.StaleCount++
|
||||
|
||||
hookResult := &StaleHookResult{
|
||||
BeadID: bead.ID,
|
||||
Title: bead.Title,
|
||||
Assignee: bead.Assignee,
|
||||
Age: time.Since(bead.UpdatedAt).Round(time.Minute).String(),
|
||||
}
|
||||
|
||||
// Check if assignee agent is still alive
|
||||
if bead.Assignee != "" {
|
||||
sessionName := assigneeToSessionName(bead.Assignee)
|
||||
if sessionName != "" {
|
||||
alive, _ := t.HasSession(sessionName)
|
||||
hookResult.AgentAlive = alive
|
||||
}
|
||||
}
|
||||
|
||||
// If agent is dead/gone and not dry run, unhook the bead
|
||||
if !hookResult.AgentAlive && !cfg.DryRun {
|
||||
if err := unhookBead(townRoot, bead.ID); err != nil {
|
||||
hookResult.Error = err.Error()
|
||||
} else {
|
||||
hookResult.Unhooked = true
|
||||
result.Unhooked++
|
||||
}
|
||||
}
|
||||
|
||||
result.Results = append(result.Results, hookResult)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// listHookedBeads returns all beads with status=hooked.
|
||||
func listHookedBeads(townRoot string) ([]*HookedBead, error) {
|
||||
cmd := exec.Command("bd", "list", "--status=hooked", "--json", "--limit=0")
|
||||
cmd.Dir = townRoot
|
||||
|
||||
output, err := cmd.Output()
|
||||
if err != nil {
|
||||
// No hooked beads is not an error
|
||||
if strings.Contains(string(output), "no issues found") {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(output) == 0 || string(output) == "[]" || string(output) == "null\n" {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
var beads []*HookedBead
|
||||
if err := json.Unmarshal(output, &beads); err != nil {
|
||||
return nil, fmt.Errorf("parsing hooked beads: %w", err)
|
||||
}
|
||||
|
||||
return beads, nil
|
||||
}
|
||||
|
||||
// assigneeToSessionName converts an assignee address to a tmux session name.
//
// Supported address forms:
//
//	"deacon", "mayor"                       -> "gt-deacon", "gt-mayor"
//	"<rig>/witness", "<rig>/refinery"       -> "gt-<rig>-<role>"
//	"<rig>/polecats/<name>"                 -> "gt-<rig>-<name>"
//	"<rig>/crew/<name>"                     -> "gt-<rig>-crew-<name>"
//
// Surrounding whitespace and trailing slashes are ignored, so "mayor/" is
// treated like "mayor". It returns "" for anything else, including addresses
// with an empty path component (e.g. "/witness" or "gastown//max"), so that a
// malformed session name is never produced.
func assigneeToSessionName(assignee string) string {
	addr := strings.TrimRight(strings.TrimSpace(assignee), "/")
	if addr == "" {
		return ""
	}

	parts := strings.Split(addr, "/")
	for _, p := range parts {
		if p == "" {
			return ""
		}
	}

	switch len(parts) {
	case 1:
		// Simple names like "deacon", "mayor"
		switch parts[0] {
		case "deacon", "mayor":
			return "gt-" + parts[0]
		}
	case 2:
		// rig/role: "gastown/witness", "gastown/refinery"
		rig, role := parts[0], parts[1]
		switch role {
		case "witness", "refinery":
			return "gt-" + rig + "-" + role
		}
	case 3:
		// rig/type/name: "gastown/polecats/max", "gastown/crew/joe"
		rig, agentType, name := parts[0], parts[1], parts[2]
		switch agentType {
		case "polecats":
			return "gt-" + rig + "-" + name
		case "crew":
			return "gt-" + rig + "-crew-" + name
		}
	}

	return ""
}
|
||||
|
||||
// unhookBead sets a bead's status back to 'open'.
//
// It runs `bd update <beadID> --status=open` in townRoot. An empty beadID is
// rejected without invoking bd. On failure the returned error wraps the
// underlying exec error and includes whatever bd printed, so callers can show
// the reason rather than a bare exit status.
func unhookBead(townRoot, beadID string) error {
	if beadID == "" {
		return fmt.Errorf("unhooking bead: empty bead ID")
	}

	cmd := exec.Command("bd", "update", beadID, "--status=open")
	cmd.Dir = townRoot

	out, err := cmd.CombinedOutput()
	if err != nil {
		if msg := strings.TrimSpace(string(out)); msg != "" {
			return fmt.Errorf("bd update %s: %w: %s", beadID, err, msg)
		}
		return fmt.Errorf("bd update %s: %w", beadID, err)
	}
	return nil
}
|
||||
@@ -340,10 +340,43 @@ gt mail send mayor/ -s "Health: <rig> <component> unresponsive" \\
|
||||
|
||||
Reset unresponsive_cycles to 0 when component responds normally."""
|
||||
|
||||
[[steps]]
|
||||
id = "stale-hook-check"
|
||||
title = "Cleanup stale hooked beads"
|
||||
needs = ["health-scan"]
|
||||
description = """
|
||||
Find and unhook beads stuck in 'hooked' status.
|
||||
|
||||
Beads can get stuck in 'hooked' status when agents die or abandon work without
|
||||
properly unhooking. This step cleans them up so the work can be reassigned.
|
||||
|
||||
**Step 1: Preview stale hooks**
|
||||
```bash
|
||||
gt deacon stale-hooks --dry-run
|
||||
```
|
||||
|
||||
Review the output - it shows:
|
||||
- Hooked beads older than 1 hour
|
||||
- Whether the assignee agent is still alive
|
||||
- What action would be taken
|
||||
|
||||
**Step 2: If stale hooks found with dead agents, unhook them**
|
||||
```bash
|
||||
gt deacon stale-hooks
|
||||
```
|
||||
|
||||
This sets status back to 'open' for beads whose assignee agent is no longer running.
|
||||
|
||||
**Step 3: If no stale hooks**
|
||||
No action needed - hooks are healthy.
|
||||
|
||||
**Note**: This is a backstop. Primary fix is ensuring agents properly unhook
|
||||
beads when they exit or hand off work."""
|
||||
|
||||
[[steps]]
|
||||
id = "zombie-scan"
|
||||
title = "Backup check for zombie polecats"
|
||||
needs = ["health-scan"]
|
||||
needs = ["stale-hook-check"]
|
||||
description = """
|
||||
Defense-in-depth check for zombie polecats that Witness should have cleaned.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user