feat(deacon): add stale hooked bead cleanup (gt-2yls3)
Add `gt deacon stale-hooks` command to find and unhook stale beads. Problem: Beads can get stuck in 'hooked' status when agents die or abandon work without properly unhooking. Solution: - New command scans for hooked beads older than threshold (default 1h) - Checks if assignee agent is still alive (tmux session exists) - Unhooks beads with dead agents (sets status back to 'open') - Supports --dry-run to preview without making changes Also adds "stale-hook-check" step to Deacon patrol formula. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
committed by
Steve Yegge
parent
ac63b10aa8
commit
74409dc32b
@@ -186,6 +186,22 @@ This helps the Deacon understand which agents may need attention.`,
|
|||||||
RunE: runDeaconHealthState,
|
RunE: runDeaconHealthState,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var deaconStaleHooksCmd = &cobra.Command{
|
||||||
|
Use: "stale-hooks",
|
||||||
|
Short: "Find and unhook stale hooked beads",
|
||||||
|
Long: `Find beads stuck in 'hooked' status and unhook them if the agent is gone.
|
||||||
|
|
||||||
|
Beads can get stuck in 'hooked' status when agents die or abandon work.
|
||||||
|
This command finds hooked beads older than the threshold (default: 1 hour),
|
||||||
|
checks if the assignee agent is still alive, and unhooks them if not.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
gt deacon stale-hooks # Find and unhook stale beads
|
||||||
|
gt deacon stale-hooks --dry-run # Preview what would be unhooked
|
||||||
|
gt deacon stale-hooks --max-age=30m # Use 30 minute threshold`,
|
||||||
|
RunE: runDeaconStaleHooks,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
var (
|
var (
|
||||||
triggerTimeout time.Duration
|
triggerTimeout time.Duration
|
||||||
@@ -198,6 +214,10 @@ var (
|
|||||||
// Force kill flags
|
// Force kill flags
|
||||||
forceKillReason string
|
forceKillReason string
|
||||||
forceKillSkipNotify bool
|
forceKillSkipNotify bool
|
||||||
|
|
||||||
|
// Stale hooks flags
|
||||||
|
staleHooksMaxAge time.Duration
|
||||||
|
staleHooksDryRun bool
|
||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
@@ -211,6 +231,7 @@ func init() {
|
|||||||
deaconCmd.AddCommand(deaconHealthCheckCmd)
|
deaconCmd.AddCommand(deaconHealthCheckCmd)
|
||||||
deaconCmd.AddCommand(deaconForceKillCmd)
|
deaconCmd.AddCommand(deaconForceKillCmd)
|
||||||
deaconCmd.AddCommand(deaconHealthStateCmd)
|
deaconCmd.AddCommand(deaconHealthStateCmd)
|
||||||
|
deaconCmd.AddCommand(deaconStaleHooksCmd)
|
||||||
|
|
||||||
// Flags for trigger-pending
|
// Flags for trigger-pending
|
||||||
deaconTriggerPendingCmd.Flags().DurationVar(&triggerTimeout, "timeout", 2*time.Second,
|
deaconTriggerPendingCmd.Flags().DurationVar(&triggerTimeout, "timeout", 2*time.Second,
|
||||||
@@ -230,6 +251,12 @@ func init() {
|
|||||||
deaconForceKillCmd.Flags().BoolVar(&forceKillSkipNotify, "skip-notify", false,
|
deaconForceKillCmd.Flags().BoolVar(&forceKillSkipNotify, "skip-notify", false,
|
||||||
"Skip sending notification mail to mayor")
|
"Skip sending notification mail to mayor")
|
||||||
|
|
||||||
|
// Flags for stale-hooks
|
||||||
|
deaconStaleHooksCmd.Flags().DurationVar(&staleHooksMaxAge, "max-age", 1*time.Hour,
|
||||||
|
"Maximum age before a hooked bead is considered stale")
|
||||||
|
deaconStaleHooksCmd.Flags().BoolVar(&staleHooksDryRun, "dry-run", false,
|
||||||
|
"Preview what would be unhooked without making changes")
|
||||||
|
|
||||||
rootCmd.AddCommand(deaconCmd)
|
rootCmd.AddCommand(deaconCmd)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -908,3 +935,68 @@ func updateAgentBeadState(townRoot, agent, state, _ string) { // reason unused b
|
|||||||
_ = cmd.Run() // Best effort
|
_ = cmd.Run() // Best effort
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// runDeaconStaleHooks finds and unhooks stale hooked beads.
|
||||||
|
func runDeaconStaleHooks(cmd *cobra.Command, args []string) error {
|
||||||
|
townRoot, err := workspace.FindFromCwdOrError()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("not in a Gas Town workspace: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg := &deacon.StaleHookConfig{
|
||||||
|
MaxAge: staleHooksMaxAge,
|
||||||
|
DryRun: staleHooksDryRun,
|
||||||
|
}
|
||||||
|
|
||||||
|
result, err := deacon.ScanStaleHooks(townRoot, cfg)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("scanning stale hooks: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Print summary
|
||||||
|
if result.TotalHooked == 0 {
|
||||||
|
fmt.Printf("%s No hooked beads found\n", style.Dim.Render("○"))
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Printf("%s Found %d hooked bead(s), %d stale (older than %s)\n",
|
||||||
|
style.Bold.Render("●"), result.TotalHooked, result.StaleCount, staleHooksMaxAge)
|
||||||
|
|
||||||
|
if result.StaleCount == 0 {
|
||||||
|
fmt.Printf("%s No stale hooked beads\n", style.Dim.Render("○"))
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Print details for each stale bead
|
||||||
|
for _, r := range result.Results {
|
||||||
|
status := style.Dim.Render("○")
|
||||||
|
action := "skipped (agent alive)"
|
||||||
|
|
||||||
|
if !r.AgentAlive {
|
||||||
|
if staleHooksDryRun {
|
||||||
|
status = style.Bold.Render("?")
|
||||||
|
action = "would unhook (agent dead)"
|
||||||
|
} else if r.Unhooked {
|
||||||
|
status = style.Bold.Render("✓")
|
||||||
|
action = "unhooked (agent dead)"
|
||||||
|
} else if r.Error != "" {
|
||||||
|
status = style.Dim.Render("✗")
|
||||||
|
action = fmt.Sprintf("error: %s", r.Error)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Printf(" %s %s: %s (age: %s, assignee: %s)\n",
|
||||||
|
status, r.BeadID, action, r.Age, r.Assignee)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Summary
|
||||||
|
if staleHooksDryRun {
|
||||||
|
fmt.Printf("\n%s Dry run - no changes made. Run without --dry-run to unhook.\n",
|
||||||
|
style.Dim.Render("ℹ"))
|
||||||
|
} else if result.Unhooked > 0 {
|
||||||
|
fmt.Printf("\n%s Unhooked %d stale bead(s)\n",
|
||||||
|
style.Bold.Render("✓"), result.Unhooked)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
|||||||
194
internal/deacon/stale_hooks.go
Normal file
194
internal/deacon/stale_hooks.go
Normal file
@@ -0,0 +1,194 @@
|
|||||||
|
// Package deacon provides the Deacon agent infrastructure.
|
||||||
|
package deacon
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"os/exec"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/steveyegge/gastown/internal/tmux"
|
||||||
|
)
|
||||||
|
|
||||||
|
// StaleHookConfig carries the tunables for a stale-hook scan.
type StaleHookConfig struct {
	// MaxAge is the staleness cutoff: a hooked bead whose last update is
	// older than this is treated as stale.
	MaxAge time.Duration `json:"max_age"`

	// DryRun, when set, makes the scan report-only; no bead is modified.
	DryRun bool `json:"dry_run"`
}
|
||||||
|
|
||||||
|
// DefaultStaleHookConfig returns the default stale hook config.
|
||||||
|
func DefaultStaleHookConfig() *StaleHookConfig {
|
||||||
|
return &StaleHookConfig{
|
||||||
|
MaxAge: 1 * time.Hour,
|
||||||
|
DryRun: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// HookedBead is one entry decoded from the JSON emitted by
// `bd list --status=hooked --json`. Only the fields the stale-hook scan
// needs are declared here.
type HookedBead struct {
	ID       string `json:"id"`
	Title    string `json:"title"`
	Status   string `json:"status"`
	Assignee string `json:"assignee"`

	// UpdatedAt is compared against the staleness threshold by the scan.
	UpdatedAt time.Time `json:"updated_at"`
}
|
||||||
|
|
||||||
|
// StaleHookResult records what the scan found and did for a single stale
// hooked bead.
type StaleHookResult struct {
	BeadID   string `json:"bead_id"`
	Title    string `json:"title"`
	Assignee string `json:"assignee"`

	// Age is the time since the bead's last update, rounded to the minute
	// and rendered as a duration string.
	Age string `json:"age"`

	// AgentAlive reports whether the assignee's tmux session was found.
	AgentAlive bool `json:"agent_alive"`

	// Unhooked is true once the bead has been set back to 'open'.
	Unhooked bool `json:"unhooked"`

	// Error holds the failure message when processing this bead failed;
	// it is omitted from JSON when empty.
	Error string `json:"error,omitempty"`
}
|
||||||
|
|
||||||
|
// StaleHookScanResult contains the full results of a stale hook scan.
|
||||||
|
type StaleHookScanResult struct {
|
||||||
|
ScannedAt time.Time `json:"scanned_at"`
|
||||||
|
TotalHooked int `json:"total_hooked"`
|
||||||
|
StaleCount int `json:"stale_count"`
|
||||||
|
Unhooked int `json:"unhooked"`
|
||||||
|
Results []*StaleHookResult `json:"results"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// ScanStaleHooks finds hooked beads older than the threshold and optionally unhooks them.
|
||||||
|
func ScanStaleHooks(townRoot string, cfg *StaleHookConfig) (*StaleHookScanResult, error) {
|
||||||
|
if cfg == nil {
|
||||||
|
cfg = DefaultStaleHookConfig()
|
||||||
|
}
|
||||||
|
|
||||||
|
result := &StaleHookScanResult{
|
||||||
|
ScannedAt: time.Now().UTC(),
|
||||||
|
Results: make([]*StaleHookResult, 0),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get all hooked beads
|
||||||
|
hookedBeads, err := listHookedBeads(townRoot)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("listing hooked beads: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
result.TotalHooked = len(hookedBeads)
|
||||||
|
|
||||||
|
// Filter to stale ones (older than threshold)
|
||||||
|
threshold := time.Now().Add(-cfg.MaxAge)
|
||||||
|
t := tmux.NewTmux()
|
||||||
|
|
||||||
|
for _, bead := range hookedBeads {
|
||||||
|
// Skip if updated recently (not stale)
|
||||||
|
if bead.UpdatedAt.After(threshold) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
result.StaleCount++
|
||||||
|
|
||||||
|
hookResult := &StaleHookResult{
|
||||||
|
BeadID: bead.ID,
|
||||||
|
Title: bead.Title,
|
||||||
|
Assignee: bead.Assignee,
|
||||||
|
Age: time.Since(bead.UpdatedAt).Round(time.Minute).String(),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if assignee agent is still alive
|
||||||
|
if bead.Assignee != "" {
|
||||||
|
sessionName := assigneeToSessionName(bead.Assignee)
|
||||||
|
if sessionName != "" {
|
||||||
|
alive, _ := t.HasSession(sessionName)
|
||||||
|
hookResult.AgentAlive = alive
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If agent is dead/gone and not dry run, unhook the bead
|
||||||
|
if !hookResult.AgentAlive && !cfg.DryRun {
|
||||||
|
if err := unhookBead(townRoot, bead.ID); err != nil {
|
||||||
|
hookResult.Error = err.Error()
|
||||||
|
} else {
|
||||||
|
hookResult.Unhooked = true
|
||||||
|
result.Unhooked++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
result.Results = append(result.Results, hookResult)
|
||||||
|
}
|
||||||
|
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// listHookedBeads returns all beads with status=hooked.
|
||||||
|
func listHookedBeads(townRoot string) ([]*HookedBead, error) {
|
||||||
|
cmd := exec.Command("bd", "list", "--status=hooked", "--json", "--limit=0")
|
||||||
|
cmd.Dir = townRoot
|
||||||
|
|
||||||
|
output, err := cmd.Output()
|
||||||
|
if err != nil {
|
||||||
|
// No hooked beads is not an error
|
||||||
|
if strings.Contains(string(output), "no issues found") {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(output) == 0 || string(output) == "[]" || string(output) == "null\n" {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var beads []*HookedBead
|
||||||
|
if err := json.Unmarshal(output, &beads); err != nil {
|
||||||
|
return nil, fmt.Errorf("parsing hooked beads: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return beads, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// assigneeToSessionName maps a bead assignee address to the tmux session
// name of that agent, or "" when the address is not a recognized form.
//
// Recognized forms:
//
//	deacon, mayor              -> gt-deacon, gt-mayor
//	<rig>/witness|refinery     -> gt-<rig>-<role>
//	<rig>/polecats/<name>      -> gt-<rig>-<name>
//	<rig>/crew/<name>          -> gt-<rig>-crew-<name>
func assigneeToSessionName(assignee string) string {
	segs := strings.Split(assignee, "/")

	switch {
	case len(segs) == 1 && (assignee == "deacon" || assignee == "mayor"):
		// Town-level singletons.
		return "gt-" + assignee

	case len(segs) == 2 && (segs[1] == "witness" || segs[1] == "refinery"):
		// Per-rig singleton roles.
		return "gt-" + segs[0] + "-" + segs[1]

	case len(segs) == 3 && segs[1] == "polecats":
		// Polecat sessions carry no type infix.
		return "gt-" + segs[0] + "-" + segs[2]

	case len(segs) == 3 && segs[1] == "crew":
		return "gt-" + segs[0] + "-crew-" + segs[2]
	}

	return ""
}
|
||||||
|
|
||||||
|
// unhookBead sets a bead's status back to 'open' by running
// `bd update <beadID> --status=open` in townRoot.
//
// On failure the returned error names the bead and includes whatever bd
// printed, so callers that surface err.Error() show the cause rather than a
// bare "exit status 1".
func unhookBead(townRoot, beadID string) error {
	cmd := exec.Command("bd", "update", beadID, "--status=open")
	cmd.Dir = townRoot

	out, err := cmd.CombinedOutput()
	if err == nil {
		return nil
	}

	if msg := strings.TrimSpace(string(out)); msg != "" {
		return fmt.Errorf("bd update %s --status=open: %w: %s", beadID, err, msg)
	}
	return fmt.Errorf("bd update %s --status=open: %w", beadID, err)
}
|
||||||
@@ -340,10 +340,43 @@ gt mail send mayor/ -s "Health: <rig> <component> unresponsive" \\
|
|||||||
|
|
||||||
Reset unresponsive_cycles to 0 when component responds normally."""
|
Reset unresponsive_cycles to 0 when component responds normally."""
|
||||||
|
|
||||||
|
[[steps]]
|
||||||
|
id = "stale-hook-check"
|
||||||
|
title = "Cleanup stale hooked beads"
|
||||||
|
needs = ["health-scan"]
|
||||||
|
description = """
|
||||||
|
Find and unhook beads stuck in 'hooked' status.
|
||||||
|
|
||||||
|
Beads can get stuck in 'hooked' status when agents die or abandon work without
|
||||||
|
properly unhooking. This step cleans them up so the work can be reassigned.
|
||||||
|
|
||||||
|
**Step 1: Preview stale hooks**
|
||||||
|
```bash
|
||||||
|
gt deacon stale-hooks --dry-run
|
||||||
|
```
|
||||||
|
|
||||||
|
Review the output - it shows:
|
||||||
|
- Hooked beads older than 1 hour
|
||||||
|
- Whether the assignee agent is still alive
|
||||||
|
- What action would be taken
|
||||||
|
|
||||||
|
**Step 2: If stale hooks found with dead agents, unhook them**
|
||||||
|
```bash
|
||||||
|
gt deacon stale-hooks
|
||||||
|
```
|
||||||
|
|
||||||
|
This sets status back to 'open' for beads whose assignee agent is no longer running.
|
||||||
|
|
||||||
|
**Step 3: If no stale hooks**
|
||||||
|
No action needed - hooks are healthy.
|
||||||
|
|
||||||
|
**Note**: This is a backstop. Primary fix is ensuring agents properly unhook
|
||||||
|
beads when they exit or hand off work."""
|
||||||
|
|
||||||
[[steps]]
|
[[steps]]
|
||||||
id = "zombie-scan"
|
id = "zombie-scan"
|
||||||
title = "Backup check for zombie polecats"
|
title = "Backup check for zombie polecats"
|
||||||
needs = ["health-scan"]
|
needs = ["stale-hook-check"]
|
||||||
description = """
|
description = """
|
||||||
Defense-in-depth check for zombie polecats that Witness should have cleaned.
|
Defense-in-depth check for zombie polecats that Witness should have cleaned.
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user