feat(deacon): Add zombie-scan backup check for idle polecats (gt-dpiw3)
Add gt deacon zombie-scan command that provides defense-in-depth against Witness cleanup failures.

The command:
- Scans all rigs for polecats that are idle, have no session running, no hooked work, and are stale (>10 min inactive)
- Reports found zombies with details
- Optionally nukes them and notifies the mayor

Flags:
  --dry-run      Preview only
  --threshold    Custom staleness threshold (default 10m)
  --nuke=false   Report only, do not clean up

Also adds zombie-scan step to mol-deacon-patrol formula.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -316,10 +316,49 @@ gt mail send mayor/ -s "Health: <rig> <component> unresponsive" \\
|
|||||||
|
|
||||||
Reset unresponsive_cycles to 0 when component responds normally."""
|
Reset unresponsive_cycles to 0 when component responds normally."""
|
||||||
|
|
||||||
|
[[steps]]
|
||||||
|
id = "zombie-scan"
|
||||||
|
title = "Backup check for zombie polecats"
|
||||||
|
needs = ["health-scan"]
|
||||||
|
description = """
|
||||||
|
Defense-in-depth check for zombie polecats that Witness should have cleaned.
|
||||||
|
|
||||||
|
**Why this exists:**
|
||||||
|
The Witness is responsible for nuking polecats after they complete work (via POLECAT_DONE).
|
||||||
|
This step provides backup detection in case the Witness fails to clean up.
|
||||||
|
|
||||||
|
**Zombie criteria:**
|
||||||
|
- State: idle or done (no active work assigned)
|
||||||
|
- Session: not running (tmux session dead)
|
||||||
|
- No hooked work (nothing pending for this polecat)
|
||||||
|
- Last activity: older than 10 minutes
|
||||||
|
|
||||||
|
**Run the zombie scan:**
|
||||||
|
```bash
|
||||||
|
gt deacon zombie-scan --dry-run
|
||||||
|
```
|
||||||
|
|
||||||
|
**If zombies detected:**
|
||||||
|
1. Review the output to confirm they are truly abandoned
|
||||||
|
2. Run without --dry-run to nuke them:
|
||||||
|
```bash
|
||||||
|
gt deacon zombie-scan
|
||||||
|
```
|
||||||
|
3. This will:
|
||||||
|
- Nuke each zombie polecat
|
||||||
|
- Notify the Mayor about Witness failure
|
||||||
|
- Log the cleanup action
|
||||||
|
|
||||||
|
**If no zombies:**
|
||||||
|
No action needed - Witness is doing its job.
|
||||||
|
|
||||||
|
**Note:** This is a backup mechanism. If you frequently find zombies,
|
||||||
|
investigate why the Witness isn't cleaning up properly."""
|
||||||
|
|
||||||
[[steps]]
|
[[steps]]
|
||||||
id = "plugin-run"
|
id = "plugin-run"
|
||||||
title = "Execute registered plugins"
|
title = "Execute registered plugins"
|
||||||
needs = ["health-scan"]
|
needs = ["zombie-scan"]
|
||||||
description = """
|
description = """
|
||||||
Execute registered plugins.
|
Execute registered plugins.
|
||||||
|
|
||||||
|
|||||||
@@ -182,6 +182,33 @@ This helps the Deacon understand which agents may need attention.`,
|
|||||||
RunE: runDeaconHealthState,
|
RunE: runDeaconHealthState,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var deaconZombieScanCmd = &cobra.Command{
|
||||||
|
Use: "zombie-scan [rig]",
|
||||||
|
Short: "Scan for idle polecats that should have been nuked",
|
||||||
|
Long: `Backup check for polecats the Witness should have cleaned up.
|
||||||
|
|
||||||
|
Scans for "zombie" polecats that meet ALL of these criteria:
|
||||||
|
- State: idle or done (no active work)
|
||||||
|
- Session: not running (tmux session dead)
|
||||||
|
- No hooked work
|
||||||
|
- Last activity: older than threshold (default 10 minutes)
|
||||||
|
|
||||||
|
These are polecats that the Witness should have nuked but didn't.
|
||||||
|
This provides defense-in-depth against Witness failures.
|
||||||
|
|
||||||
|
Actions:
|
||||||
|
1. Log warning about witness failure
|
||||||
|
2. Nuke the zombie polecat directly
|
||||||
|
3. Notify mayor of witness issue (optional)
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
gt deacon zombie-scan # Scan all rigs
|
||||||
|
gt deacon zombie-scan gastown # Scan specific rig
|
||||||
|
gt deacon zombie-scan --dry-run # Preview only
|
||||||
|
gt deacon zombie-scan --threshold=5m # Custom staleness threshold`,
|
||||||
|
Args: cobra.MaximumNArgs(1),
|
||||||
|
RunE: runDeaconZombieScan,
|
||||||
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
triggerTimeout time.Duration
|
triggerTimeout time.Duration
|
||||||
@@ -194,6 +221,11 @@ var (
|
|||||||
// Force kill flags
|
// Force kill flags
|
||||||
forceKillReason string
|
forceKillReason string
|
||||||
forceKillSkipNotify bool
|
forceKillSkipNotify bool
|
||||||
|
|
||||||
|
// Zombie scan flags
|
||||||
|
zombieScanDryRun bool
|
||||||
|
zombieScanThreshold time.Duration
|
||||||
|
zombieScanNuke bool
|
||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
@@ -207,6 +239,7 @@ func init() {
|
|||||||
deaconCmd.AddCommand(deaconHealthCheckCmd)
|
deaconCmd.AddCommand(deaconHealthCheckCmd)
|
||||||
deaconCmd.AddCommand(deaconForceKillCmd)
|
deaconCmd.AddCommand(deaconForceKillCmd)
|
||||||
deaconCmd.AddCommand(deaconHealthStateCmd)
|
deaconCmd.AddCommand(deaconHealthStateCmd)
|
||||||
|
deaconCmd.AddCommand(deaconZombieScanCmd)
|
||||||
|
|
||||||
// Flags for trigger-pending
|
// Flags for trigger-pending
|
||||||
deaconTriggerPendingCmd.Flags().DurationVar(&triggerTimeout, "timeout", 2*time.Second,
|
deaconTriggerPendingCmd.Flags().DurationVar(&triggerTimeout, "timeout", 2*time.Second,
|
||||||
@@ -226,6 +259,14 @@ func init() {
|
|||||||
deaconForceKillCmd.Flags().BoolVar(&forceKillSkipNotify, "skip-notify", false,
|
deaconForceKillCmd.Flags().BoolVar(&forceKillSkipNotify, "skip-notify", false,
|
||||||
"Skip sending notification mail to mayor")
|
"Skip sending notification mail to mayor")
|
||||||
|
|
||||||
|
// Flags for zombie-scan
|
||||||
|
deaconZombieScanCmd.Flags().BoolVarP(&zombieScanDryRun, "dry-run", "n", false,
|
||||||
|
"Show what would be done without nuking")
|
||||||
|
deaconZombieScanCmd.Flags().DurationVar(&zombieScanThreshold, "threshold", 10*time.Minute,
|
||||||
|
"Staleness threshold for zombie detection")
|
||||||
|
deaconZombieScanCmd.Flags().BoolVar(&zombieScanNuke, "nuke", true,
|
||||||
|
"Nuke detected zombies (use --nuke=false to report only)")
|
||||||
|
|
||||||
rootCmd.AddCommand(deaconCmd)
|
rootCmd.AddCommand(deaconCmd)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -804,6 +845,258 @@ func runDeaconHealthState(cmd *cobra.Command, args []string) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// runDeaconZombieScan scans for idle polecats that should have been nuked by the Witness.
|
||||||
|
// This is a defense-in-depth backup check.
|
||||||
|
func runDeaconZombieScan(cmd *cobra.Command, args []string) error {
|
||||||
|
townRoot, err := workspace.FindFromCwdOrError()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("not in a Gas Town workspace: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
t := tmux.NewTmux()
|
||||||
|
|
||||||
|
// Get list of rigs to scan
|
||||||
|
var rigsToScan []string
|
||||||
|
if len(args) > 0 {
|
||||||
|
rigsToScan = []string{args[0]}
|
||||||
|
} else {
|
||||||
|
// Scan all rigs by finding directories with polecats/ subdirectories
|
||||||
|
entries, err := os.ReadDir(townRoot)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("reading town root: %w", err)
|
||||||
|
}
|
||||||
|
for _, entry := range entries {
|
||||||
|
if !entry.IsDir() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Skip non-rig directories
|
||||||
|
if entry.Name() == "deacon" || entry.Name() == "mayor" ||
|
||||||
|
entry.Name() == "plugins" || entry.Name() == "docs" ||
|
||||||
|
strings.HasPrefix(entry.Name(), ".") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Check if it has a polecats directory
|
||||||
|
polecatsDir := filepath.Join(townRoot, entry.Name(), "polecats")
|
||||||
|
if info, err := os.Stat(polecatsDir); err == nil && info.IsDir() {
|
||||||
|
rigsToScan = append(rigsToScan, entry.Name())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(rigsToScan) == 0 {
|
||||||
|
fmt.Printf("%s No rigs found to scan\n", style.Dim.Render("○"))
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Printf("%s Scanning for zombie polecats (threshold: %s)...\n",
|
||||||
|
style.Bold.Render("🧟"), zombieScanThreshold)
|
||||||
|
|
||||||
|
var zombies []zombieInfo
|
||||||
|
for _, rigName := range rigsToScan {
|
||||||
|
rigZombies, err := scanRigForZombies(townRoot, rigName, t)
|
||||||
|
if err != nil {
|
||||||
|
style.PrintWarning("failed to scan rig %s: %v", rigName, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
zombies = append(zombies, rigZombies...)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(zombies) == 0 {
|
||||||
|
fmt.Printf("%s No zombies found (all polecats healthy)\n", style.Bold.Render("✓"))
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Report zombies
|
||||||
|
fmt.Printf("\n%s Found %d zombie(s):\n\n", style.Bold.Render("⚠"), len(zombies))
|
||||||
|
for _, z := range zombies {
|
||||||
|
fmt.Printf(" %s %s/%s\n", style.Dim.Render("🧟"), z.rig, z.name)
|
||||||
|
fmt.Printf(" State: %s, Session: %s\n", z.state, z.sessionStatus)
|
||||||
|
fmt.Printf(" Hooked work: %s\n", z.hookedWork)
|
||||||
|
fmt.Printf(" Last activity: %s ago\n", z.staleness.Round(time.Second))
|
||||||
|
fmt.Printf(" Reason: %s\n", z.reason)
|
||||||
|
fmt.Println()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Nuke zombies if enabled
|
||||||
|
if zombieScanNuke && !zombieScanDryRun {
|
||||||
|
fmt.Printf("%s Nuking zombies...\n", style.Bold.Render("💀"))
|
||||||
|
for _, z := range zombies {
|
||||||
|
if err := nukeZombie(townRoot, z, t); err != nil {
|
||||||
|
style.PrintWarning("failed to nuke %s/%s: %v", z.rig, z.name, err)
|
||||||
|
} else {
|
||||||
|
fmt.Printf(" %s Nuked %s/%s\n", style.Bold.Render("✓"), z.rig, z.name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Notify mayor about witness failure
|
||||||
|
notifyMayorOfWitnessFailure(townRoot, zombies)
|
||||||
|
} else if zombieScanDryRun {
|
||||||
|
fmt.Printf("%s Dry run - would nuke %d zombie(s)\n", style.Dim.Render("ℹ"), len(zombies))
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// zombieInfo describes a single polecat flagged by the zombie scan: which
// polecat it is, the values shown in the report, and how long it has been
// inactive.
type zombieInfo struct {
	// Identity of the polecat and the tmux session name built from it.
	rig, name, sessionName string

	// Human-readable values printed in the zombie report.
	state, sessionStatus, hookedWork, reason string

	// Time since the polecat's last recorded activity.
	staleness time.Duration
}
|
||||||
|
|
||||||
|
// scanRigForZombies scans a rig for zombie polecats.
|
||||||
|
func scanRigForZombies(townRoot, rigName string, t *tmux.Tmux) ([]zombieInfo, error) {
|
||||||
|
rigPath := filepath.Join(townRoot, rigName)
|
||||||
|
polecatsDir := filepath.Join(rigPath, "polecats")
|
||||||
|
|
||||||
|
entries, err := os.ReadDir(polecatsDir)
|
||||||
|
if err != nil {
|
||||||
|
if os.IsNotExist(err) {
|
||||||
|
return nil, nil // No polecats dir
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var zombies []zombieInfo
|
||||||
|
for _, entry := range entries {
|
||||||
|
if !entry.IsDir() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
name := entry.Name()
|
||||||
|
|
||||||
|
// Build session name for this polecat
|
||||||
|
sessionName := fmt.Sprintf("gt-%s-%s", rigName, name)
|
||||||
|
|
||||||
|
// Check if session is running
|
||||||
|
sessionRunning, _ := t.HasSession(sessionName)
|
||||||
|
|
||||||
|
// Check for hooked work
|
||||||
|
hookedWork := checkPolecatHookedWork(townRoot, rigName, name)
|
||||||
|
|
||||||
|
// Get last activity time from polecat directory
|
||||||
|
polecatPath := filepath.Join(polecatsDir, name)
|
||||||
|
staleness := getPolecatStaleness(polecatPath)
|
||||||
|
|
||||||
|
// Determine if this is a zombie
|
||||||
|
state := "unknown"
|
||||||
|
if sessionRunning {
|
||||||
|
state = "session_running"
|
||||||
|
continue // Not a zombie if session is running
|
||||||
|
}
|
||||||
|
state = "session_dead"
|
||||||
|
|
||||||
|
// Check all zombie criteria
|
||||||
|
if hookedWork != "" {
|
||||||
|
// Has hooked work - not a zombie (just needs to be started)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if staleness < zombieScanThreshold {
|
||||||
|
// Recently active - not stale enough
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// This is a zombie
|
||||||
|
zombies = append(zombies, zombieInfo{
|
||||||
|
rig: rigName,
|
||||||
|
name: name,
|
||||||
|
state: state,
|
||||||
|
sessionStatus: "not running",
|
||||||
|
hookedWork: "none",
|
||||||
|
staleness: staleness,
|
||||||
|
reason: fmt.Sprintf("idle for %s with no session or hooked work", staleness.Round(time.Minute)),
|
||||||
|
sessionName: sessionName,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
return zombies, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// checkPolecatHookedWork returns the ID of the first hooked issue assigned to
// the given polecat, or "" when none is reported.
//
// It runs `bd list --status=hooked --assignee=<rig>/polecats/<name> --json`
// with townRoot as the working directory and decodes the output as a JSON
// array of issues.
//
// NOTE(review): a failed command, undecodable output, and an empty list all
// produce "", so callers cannot tell "no hooked work" from "lookup failed".
func checkPolecatHookedWork(townRoot, rigName, polecatName string) string {
	type hookedIssue struct {
		ID    string `json:"id"`
		Title string `json:"title"`
	}

	// Query beads for hooked issues assigned to this polecat.
	query := exec.Command(
		"bd", "list",
		"--status=hooked",
		"--assignee="+fmt.Sprintf("%s/polecats/%s", rigName, polecatName),
		"--json",
	)
	query.Dir = townRoot

	raw, runErr := query.Output()
	if runErr != nil {
		return ""
	}

	var hooked []hookedIssue
	if json.Unmarshal(raw, &hooked) != nil {
		return ""
	}
	if len(hooked) == 0 {
		return ""
	}
	return hooked[0].ID
}
|
||||||
|
|
||||||
|
// getPolecatStaleness reports how much time has passed since the polecat was
// last active.
//
// The modification time of <polecatPath>/.beads/last-touched is used when that
// file can be stat'ed; otherwise the modification time of the polecat
// directory itself. If neither can be stat'ed the polecat is treated as very
// stale and 24 hours is returned.
func getPolecatStaleness(polecatPath string) time.Duration {
	// Candidates in priority order: explicit marker first, directory second.
	candidates := [...]string{
		filepath.Join(polecatPath, ".beads", "last-touched"),
		polecatPath,
	}
	for _, candidate := range candidates {
		fi, statErr := os.Stat(candidate)
		if statErr != nil {
			continue
		}
		return time.Since(fi.ModTime())
	}

	// Very stale if we can't determine
	return 24 * time.Hour
}
|
||||||
|
|
||||||
|
// nukeZombie cleans up a zombie polecat.
|
||||||
|
func nukeZombie(townRoot string, z zombieInfo, t *tmux.Tmux) error {
|
||||||
|
// Step 1: Kill tmux session if somehow still exists
|
||||||
|
if exists, _ := t.HasSession(z.sessionName); exists {
|
||||||
|
_ = t.KillSession(z.sessionName)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 2: Run gt polecat nuke to clean up
|
||||||
|
cmd := exec.Command("gt", "polecat", "nuke", z.name, "--rig="+z.rig, "--force")
|
||||||
|
cmd.Dir = townRoot
|
||||||
|
if err := cmd.Run(); err != nil {
|
||||||
|
// Non-fatal - polecat might already be cleaned up
|
||||||
|
style.PrintWarning("polecat nuke returned error (may be already cleaned): %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// notifyMayorOfWitnessFailure notifies the mayor about witness cleanup failures.
|
||||||
|
func notifyMayorOfWitnessFailure(townRoot string, zombies []zombieInfo) {
|
||||||
|
if len(zombies) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Group by rig
|
||||||
|
rigCounts := make(map[string]int)
|
||||||
|
for _, z := range zombies {
|
||||||
|
rigCounts[z.rig]++
|
||||||
|
}
|
||||||
|
|
||||||
|
var details strings.Builder
|
||||||
|
details.WriteString("Deacon detected zombie polecats that Witness should have cleaned:\n\n")
|
||||||
|
for rig, count := range rigCounts {
|
||||||
|
details.WriteString(fmt.Sprintf("- %s: %d zombie(s)\n", rig, count))
|
||||||
|
}
|
||||||
|
details.WriteString("\nDeacon has nuked them directly. Check Witness health.")
|
||||||
|
|
||||||
|
sendMail(townRoot, "mayor/", "⚠️ Witness cleanup failure detected", details.String())
|
||||||
|
}
|
||||||
|
|
||||||
// agentAddressToIDs converts an agent address to bead ID and session name.
|
// agentAddressToIDs converts an agent address to bead ID and session name.
|
||||||
// Supports formats: "gastown/polecats/max", "gastown/witness", "deacon", "mayor"
|
// Supports formats: "gastown/polecats/max", "gastown/witness", "deacon", "mayor"
|
||||||
func agentAddressToIDs(address string) (beadID, sessionName string, err error) {
|
func agentAddressToIDs(address string) (beadID, sessionName string, err error) {
|
||||||
|
|||||||
Reference in New Issue
Block a user