Witness: Add recovery escalation for dormant polecats with unpushed work

- Add gt polecat check-recovery command to check cleanup_status from agent bead
- Update gt polecat nuke to block on polecats with unpushed work (use --force to override)
- Add EscalateRecoveryNeeded function for RECOVERY_NEEDED escalations
- Update Witness template with dormant polecat recovery protocol

Prevents accidental data loss when cleaning up dormant polecats that have
unpushed commits. The Witness should now use check-recovery before nuking
and escalate NEEDS_RECOVERY cases to the Mayor.

(gt-cloml)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
gastown/polecats/slit
2025-12-30 20:59:46 -08:00
committed by Steve Yegge
parent d006b20d7c
commit aed2482d88
3 changed files with 314 additions and 15 deletions

View File

@@ -199,9 +199,11 @@ var (
polecatSyncFromMain bool polecatSyncFromMain bool
polecatStatusJSON bool polecatStatusJSON bool
polecatGitStateJSON bool polecatGitStateJSON bool
polecatGCDryRun bool polecatGCDryRun bool
polecatNukeAll bool polecatNukeAll bool
polecatNukeDryRun bool polecatNukeDryRun bool
polecatNukeForce bool
polecatCheckRecoveryJSON bool
) )
var polecatGCCmd = &cobra.Command{ var polecatGCCmd = &cobra.Command{
@@ -266,6 +268,25 @@ Examples:
RunE: runPolecatGitState, RunE: runPolecatGitState,
} }
// polecatCheckRecoveryCmd defines the "gt polecat check-recovery" subcommand.
// It requires exactly one <rig>/<polecat> argument and delegates to
// runPolecatCheckRecovery, which reports SAFE_TO_NUKE or NEEDS_RECOVERY.
var polecatCheckRecoveryCmd = &cobra.Command{
Use: "check-recovery <rig>/<polecat>",
Short: "Check if polecat needs recovery vs safe to nuke",
Long: `Check recovery status of a polecat based on cleanup_status in agent bead.
Used by the Witness to determine appropriate cleanup action:
- SAFE_TO_NUKE: cleanup_status is 'clean' - no work at risk
- NEEDS_RECOVERY: cleanup_status indicates unpushed/uncommitted work
This prevents accidental data loss when cleaning up dormant polecats.
The Witness should escalate NEEDS_RECOVERY cases to the Mayor.
Examples:
gt polecat check-recovery greenplace/Toast
gt polecat check-recovery greenplace/Toast --json`,
Args: cobra.ExactArgs(1),
RunE: runPolecatCheckRecovery,
}
func init() { func init() {
// List flags // List flags
polecatListCmd.Flags().BoolVar(&polecatListJSON, "json", false, "Output as JSON") polecatListCmd.Flags().BoolVar(&polecatListJSON, "json", false, "Output as JSON")
@@ -291,6 +312,10 @@ func init() {
// Nuke flags // Nuke flags
polecatNukeCmd.Flags().BoolVar(&polecatNukeAll, "all", false, "Nuke all polecats in the rig") polecatNukeCmd.Flags().BoolVar(&polecatNukeAll, "all", false, "Nuke all polecats in the rig")
polecatNukeCmd.Flags().BoolVar(&polecatNukeDryRun, "dry-run", false, "Show what would be nuked without doing it") polecatNukeCmd.Flags().BoolVar(&polecatNukeDryRun, "dry-run", false, "Show what would be nuked without doing it")
polecatNukeCmd.Flags().BoolVarP(&polecatNukeForce, "force", "f", false, "Force nuke even if polecat has unpushed work")
// Check-recovery flags
polecatCheckRecoveryCmd.Flags().BoolVar(&polecatCheckRecoveryJSON, "json", false, "Output as JSON")
// Add subcommands // Add subcommands
polecatCmd.AddCommand(polecatListCmd) polecatCmd.AddCommand(polecatListCmd)
@@ -303,6 +328,7 @@ func init() {
polecatCmd.AddCommand(polecatSyncCmd) polecatCmd.AddCommand(polecatSyncCmd)
polecatCmd.AddCommand(polecatStatusCmd) polecatCmd.AddCommand(polecatStatusCmd)
polecatCmd.AddCommand(polecatGitStateCmd) polecatCmd.AddCommand(polecatGitStateCmd)
polecatCmd.AddCommand(polecatCheckRecoveryCmd)
polecatCmd.AddCommand(polecatGCCmd) polecatCmd.AddCommand(polecatGCCmd)
polecatCmd.AddCommand(polecatNukeCmd) polecatCmd.AddCommand(polecatNukeCmd)
@@ -1054,6 +1080,122 @@ func getGitState(worktreePath string) (*GitState, error) {
return state, nil return state, nil
} }
// RecoveryStatus represents whether a polecat needs recovery or is safe to nuke.
// It is the value that "gt polecat check-recovery --json" encodes to stdout.
type RecoveryStatus struct {
// Rig and Polecat identify the polecat that was checked.
Rig string `json:"rig"`
Polecat string `json:"polecat"`
// CleanupStatus is the status the verdict was derived from: either the
// cleanup_status read from the agent bead, or one of "clean",
// "has_unpushed", "has_stash", "has_uncommitted", "unknown" computed from
// the clone's git state when the bead is unavailable.
CleanupStatus string `json:"cleanup_status"`
// NeedsRecovery is true for every CleanupStatus other than "clean".
NeedsRecovery bool `json:"needs_recovery"`
Verdict string `json:"verdict"` // SAFE_TO_NUKE or NEEDS_RECOVERY
// Branch and Issue are copied from the polecat's info and are omitted from
// the JSON output when empty.
Branch string `json:"branch,omitempty"`
Issue string `json:"issue,omitempty"`
}
// runPolecatCheckRecovery implements "gt polecat check-recovery <rig>/<polecat>".
//
// It reports whether the polecat named by args[0] can be nuked without losing
// work. The cleanup_status recorded in the polecat's agent bead is used when
// the bead can be read; otherwise the status is derived from the git state of
// the polecat's clone. Any status other than "clean" (including an empty or
// unrecognized one) yields NEEDS_RECOVERY, so the check errs on the side of
// preserving work.
//
// The result is written to stdout: indented JSON when --json is set, a styled
// summary otherwise. An error is returned if the address cannot be parsed, the
// rig's polecat manager or the polecat itself cannot be loaded, or JSON
// encoding fails.
func runPolecatCheckRecovery(cmd *cobra.Command, args []string) error {
	rigName, polecatName, err := parseAddress(args[0])
	if err != nil {
		return err
	}

	mgr, r, err := getPolecatManager(rigName)
	if err != nil {
		return err
	}

	// Verify the polecat exists and get its info. The underlying cause is kept
	// in the chain so failures other than "not found" remain diagnosable.
	p, err := mgr.Get(polecatName)
	if err != nil {
		return fmt.Errorf("polecat '%s' not found in rig '%s': %w", polecatName, rigName, err)
	}

	status := RecoveryStatus{
		Rig:     rigName,
		Polecat: polecatName,
		Branch:  p.Branch,
		Issue:   p.Issue,
	}

	// cleanup_status is read directly from beads since the manager doesn't
	// expose it.
	bd := beads.New(r.Path)
	_, fields, err := bd.GetAgentBead(beads.PolecatBeadID(rigName, polecatName))
	if err != nil || fields == nil {
		// No readable agent bead - fall back to a git check of the clone.
		status.CleanupStatus = gitCleanupStatus(p.ClonePath)
	} else {
		status.CleanupStatus = fields.CleanupStatus
	}

	// A bead without a recorded cleanup_status yields an empty string; report
	// it as "unknown" instead of printing a blank status.
	if status.CleanupStatus == "" {
		status.CleanupStatus = "unknown"
	}

	// Only an explicit "clean" is safe. Every other value, known or not, is
	// treated conservatively as work at risk.
	status.NeedsRecovery = status.CleanupStatus != "clean"
	if status.NeedsRecovery {
		status.Verdict = "NEEDS_RECOVERY"
	} else {
		status.Verdict = "SAFE_TO_NUKE"
	}

	// JSON output
	if polecatCheckRecoveryJSON {
		enc := json.NewEncoder(os.Stdout)
		enc.SetIndent("", " ")
		return enc.Encode(status)
	}

	// Human-readable output
	fmt.Printf("%s\n\n", style.Bold.Render(fmt.Sprintf("Recovery Status: %s/%s", rigName, polecatName)))
	fmt.Printf(" Cleanup Status: %s\n", status.CleanupStatus)
	if status.Branch != "" {
		fmt.Printf(" Branch: %s\n", status.Branch)
	}
	if status.Issue != "" {
		fmt.Printf(" Issue: %s\n", status.Issue)
	}
	fmt.Println()

	if status.NeedsRecovery {
		fmt.Printf(" Verdict: %s\n", style.Error.Render("NEEDS_RECOVERY"))
		fmt.Println()
		fmt.Printf(" %s This polecat has unpushed/uncommitted work.\n", style.Warning.Render("⚠"))
		fmt.Println(" Escalate to Mayor for recovery before cleanup.")
		return nil
	}

	fmt.Printf(" Verdict: %s\n", style.Success.Render("SAFE_TO_NUKE"))
	fmt.Println()
	fmt.Printf(" %s Safe to nuke - no work at risk.\n", style.Success.Render("✓"))
	return nil
}

// gitCleanupStatus derives a cleanup_status value from the git state of the
// worktree at clonePath, for polecats whose agent bead cannot be read.
// It returns "unknown" when the git state itself cannot be determined.
// Unpushed commits are reported ahead of stashes, and stashes ahead of any
// other non-clean state.
func gitCleanupStatus(clonePath string) string {
	gitState, err := getGitState(clonePath)
	switch {
	case err != nil:
		return "unknown"
	case gitState.Clean:
		return "clean"
	case gitState.UnpushedCommits > 0:
		return "has_unpushed"
	case gitState.StashCount > 0:
		return "has_stash"
	default:
		return "has_uncommitted"
	}
}
func runPolecatGC(cmd *cobra.Command, args []string) error { func runPolecatGC(cmd *cobra.Command, args []string) error {
rigName := args[0] rigName := args[0]
@@ -1199,6 +1341,58 @@ func runPolecatNuke(cmd *cobra.Command, args []string) error {
} }
} }
// Check recovery status for each polecat unless --force is set
// This prevents accidental data loss when nuking polecats with unpushed work
if !polecatNukeForce && !polecatNukeDryRun {
var needsRecovery []string
for _, p := range toNuke {
// Check cleanup_status from agent bead
bd := beads.New(p.r.Path)
agentBeadID := beads.PolecatBeadID(p.rigName, p.polecatName)
_, fields, err := bd.GetAgentBead(agentBeadID)
var recoveryNeeded bool
if err != nil || fields == nil {
// No agent bead - fall back to git check
polecatInfo, infoErr := p.mgr.Get(p.polecatName)
if infoErr == nil && polecatInfo != nil {
gitState, gitErr := getGitState(polecatInfo.ClonePath)
if gitErr != nil || !gitState.Clean {
recoveryNeeded = true
}
}
} else {
// Check cleanup_status from agent bead
switch fields.CleanupStatus {
case "clean":
recoveryNeeded = false
case "has_uncommitted", "has_unpushed", "has_stash", "unknown", "":
recoveryNeeded = true
default:
recoveryNeeded = true
}
}
if recoveryNeeded {
needsRecovery = append(needsRecovery, fmt.Sprintf("%s/%s", p.rigName, p.polecatName))
}
}
if len(needsRecovery) > 0 {
fmt.Printf("%s The following polecats have unpushed/uncommitted work:\n", style.Error.Render("Error:"))
for _, pc := range needsRecovery {
fmt.Printf(" - %s\n", pc)
}
fmt.Println()
fmt.Println("These polecats NEED RECOVERY before cleanup.")
fmt.Println("Options:")
fmt.Printf(" 1. Escalate to Mayor: gt mail send mayor/ -s \"RECOVERY_NEEDED\" -m \"...\"\n")
fmt.Printf(" 2. Force nuke (LOSES WORK): gt polecat nuke --force %s\n", strings.Join(needsRecovery, " "))
fmt.Println()
return fmt.Errorf("blocked: %d polecat(s) need recovery", len(needsRecovery))
}
}
// Nuke each polecat // Nuke each polecat
t := tmux.NewTmux() t := tmux.NewTmux()
var nukeErrors []string var nukeErrors []string
@@ -1214,7 +1408,11 @@ func runPolecatNuke(cmd *cobra.Command, args []string) error {
continue continue
} }
fmt.Printf("Nuking %s/%s...\n", p.rigName, p.polecatName) if polecatNukeForce {
fmt.Printf("%s Nuking %s/%s (--force)...\n", style.Warning.Render("⚠"), p.rigName, p.polecatName)
} else {
fmt.Printf("Nuking %s/%s...\n", p.rigName, p.polecatName)
}
// Step 1: Kill session (force mode - no graceful shutdown) // Step 1: Kill session (force mode - no graceful shutdown)
sessMgr := session.NewManager(t, p.r) sessMgr := session.NewManager(t, p.r)

View File

@@ -348,6 +348,55 @@ Requested at: %s`,
return msg.ID, nil return msg.ID, nil
} }
// RecoveryPayload contains data for RECOVERY_NEEDED escalation.
// EscalateRecoveryNeeded copies these values into the message it sends.
type RecoveryPayload struct {
// PolecatName is the polecat's name without the rig prefix; it appears in
// the message subject and body as <rig>/<PolecatName>.
PolecatName string
// Rig is the rig the polecat belongs to.
// NOTE(review): EscalateRecoveryNeeded uses its rigName parameter rather
// than this field - confirm whether the two are meant to always agree.
Rig string
// CleanupStatus, Branch and IssueID are reported verbatim in the body.
CleanupStatus string
Branch string
IssueID string
// DetectedAt is rendered in the body using time.RFC3339.
DetectedAt time.Time
}
// EscalateRecoveryNeeded sends a RECOVERY_NEEDED escalation to the Mayor.
// This is used when a dormant polecat has unpushed work that needs recovery
// before cleanup. The Mayor should coordinate recovery (e.g., push the branch,
// save the work) before authorizing cleanup.
//
// The message is sent from "<rigName>/witness" to "mayor/" with urgent
// priority. On success the ID of the sent message is returned. An error is
// returned if payload is nil or if the router fails to send.
//
// NOTE(review): the closing line of the body ("DO NOT nuke without --force
// after recovery.") reads ambiguously - confirm the intended instruction
// before relying on it.
func EscalateRecoveryNeeded(router *mail.Router, rigName string, payload *RecoveryPayload) (string, error) {
	// Every field of the message is built from payload, so a nil payload
	// would otherwise panic below instead of surfacing as an error.
	if payload == nil {
		return "", fmt.Errorf("escalating recovery for rig %q: nil payload", rigName)
	}

	body := fmt.Sprintf(`Polecat: %s/%s
Cleanup Status: %s
Branch: %s
Issue: %s
Detected: %s
This polecat has unpushed/uncommitted work that will be lost if nuked.
Please coordinate recovery before authorizing cleanup:
1. Check if branch can be pushed to origin
2. Review uncommitted changes for value
3. Either recover the work or authorize force-nuke
DO NOT nuke without --force after recovery.`,
		rigName,
		payload.PolecatName,
		payload.CleanupStatus,
		payload.Branch,
		payload.IssueID,
		payload.DetectedAt.Format(time.RFC3339),
	)

	msg := &mail.Message{
		From:     fmt.Sprintf("%s/witness", rigName),
		To:       "mayor/",
		Subject:  fmt.Sprintf("RECOVERY_NEEDED %s/%s", rigName, payload.PolecatName),
		Priority: mail.PriorityUrgent,
		Body:     body,
	}

	if err := router.Send(msg); err != nil {
		return "", err
	}
	return msg.ID, nil
}
// UpdateCleanupWispState updates a cleanup wisp's state label. // UpdateCleanupWispState updates a cleanup wisp's state label.
func UpdateCleanupWispState(workDir, wispID, newState string) error { func UpdateCleanupWispState(workDir, wispID, newState string) error {
// Get current labels to preserve other labels // Get current labels to preserve other labels

View File

@@ -30,28 +30,73 @@ Check your mail with: `gt mail inbox`
--- ---
## Dormant Polecat Recovery Protocol
When checking dormant polecats, use the recovery check command:
```bash
gt polecat check-recovery {{RIG}}/<name>
```
This returns one of:
- **SAFE_TO_NUKE**: cleanup_status is 'clean' - proceed with normal cleanup
- **NEEDS_RECOVERY**: cleanup_status indicates unpushed/uncommitted work
### If NEEDS_RECOVERY
**CRITICAL: Do NOT auto-nuke polecats with unpushed work.**
Instead, escalate to Mayor:
```bash
gt mail send mayor/ -s "RECOVERY_NEEDED {{RIG}}/<polecat>" -m "Cleanup Status: <cleanup-status>
Branch: <branch-name>
Issue: <issue-id>
Detected: $(date -Iseconds)
This polecat has unpushed/uncommitted work that will be lost if nuked.
Please coordinate recovery before authorizing cleanup."
```
The nuke command will block automatically:
```bash
$ gt polecat nuke {{RIG}}/<name>
Error: The following polecats have unpushed/uncommitted work:
- {{RIG}}/<name>
These polecats NEED RECOVERY before cleanup.
Options:
1. Escalate to Mayor: gt mail send mayor/ -s "RECOVERY_NEEDED" -m "..."
2. Force nuke (LOSES WORK): gt polecat nuke --force {{RIG}}/<name>
```
Only use `--force` after Mayor authorizes or confirms work is unrecoverable.
---
## Pre-Kill Verification Checklist ## Pre-Kill Verification Checklist
Before killing ANY polecat session, verify: Before killing ANY polecat session, verify:
``` ```
[ ] 1. gt polecat git-state <name> # Must be clean [ ] 1. gt polecat check-recovery {{RIG}}/<name> # Must be SAFE_TO_NUKE
[ ] 2. Check for uncommitted work: [ ] 2. gt polecat git-state <name> # Must be clean
cd polecats/<name> && git status [ ] 3. Verify issue closed:
[ ] 3. Check for unpushed commits:
git log origin/main..HEAD
[ ] 4. Verify issue closed:
bd show <issue-id> # Should show 'closed' bd show <issue-id> # Should show 'closed'
[ ] 5. Verify PR submitted (if applicable): [ ] 4. Verify PR submitted (if applicable):
Check merge queue or PR status Check merge queue or PR status
``` ```
**If git state is dirty:** **If NEEDS_RECOVERY:**
1. Send RECOVERY_NEEDED escalation to Mayor (see above)
2. Wait for Mayor authorization
3. Do NOT proceed with nuke
**If git state dirty but polecat still alive:**
1. Nudge the worker to clean up 1. Nudge the worker to clean up
2. Wait 5 minutes for response 2. Wait 5 minutes for response
3. If still dirty after 3 attempts → Escalate to Mayor 3. If still dirty after 3 attempts → Escalate to Mayor
**If all checks pass:** **If SAFE_TO_NUKE and all checks pass:**
1. **Send MERGE_READY to refinery** (CRITICAL - do this BEFORE killing): 1. **Send MERGE_READY to refinery** (CRITICAL - do this BEFORE killing):
```bash ```bash
gt mail send {{RIG}}/refinery -s "MERGE_READY <polecat>" -m "Branch: <branch> gt mail send {{RIG}}/refinery -s "MERGE_READY <polecat>" -m "Branch: <branch>
@@ -65,6 +110,7 @@ Before killing ANY polecat session, verify:
``` ```
NOTE: Use `gt polecat nuke` instead of raw git commands. It knows the correct NOTE: Use `gt polecat nuke` instead of raw git commands. It knows the correct
worktree parent repo (mayor/rig or .repo.git) and handles cleanup properly. worktree parent repo (mayor/rig or .repo.git) and handles cleanup properly.
The nuke will automatically block if cleanup_status indicates unpushed or uncommitted work.
3. **Notify Mayor** (for tracking): 3. **Notify Mayor** (for tracking):
```bash ```bash
gt mail send mayor/ -s "Polecat <name> processed" -m "Work: <issue> gt mail send mayor/ -s "Polecat <name> processed" -m "Work: <issue>
@@ -77,8 +123,11 @@ Before killing ANY polecat session, verify:
```bash ```bash
# Polecat management # Polecat management
gt polecat list {{RIG}} # See all polecats gt polecat list {{RIG}} # See all polecats
gt polecat git-state <name> # Check git cleanliness gt polecat check-recovery {{RIG}}/<name> # Check if safe to nuke
gt polecat git-state {{RIG}}/<name> # Check git cleanliness
gt polecat nuke {{RIG}}/<name> # Nuke (blocks on unpushed work)
gt polecat nuke --force {{RIG}}/<name> # Force nuke (LOSES WORK)
# Session inspection # Session inspection
tmux capture-pane -t gt-{{RIG}}-<name> -p | tail -40 tmux capture-pane -t gt-{{RIG}}-<name> -p | tail -40
@@ -91,12 +140,15 @@ gt mail inbox
gt mail read <id> gt mail read <id>
gt mail send mayor/ -s "Subject" -m "Message" gt mail send mayor/ -s "Subject" -m "Message"
gt mail send {{RIG}}/refinery -s "MERGE_READY <polecat>" -m "..." gt mail send {{RIG}}/refinery -s "MERGE_READY <polecat>" -m "..."
gt mail send mayor/ -s "RECOVERY_NEEDED {{RIG}}/<polecat>" -m "..." # Escalate
``` ```
--- ---
## Do NOT ## Do NOT
- **Nuke polecats with unpushed work** - always run `check-recovery` first
- Use `--force` without Mayor authorization
- Kill sessions without completing pre-kill verification - Kill sessions without completing pre-kill verification
- Kill sessions without sending MERGE_READY to refinery - Kill sessions without sending MERGE_READY to refinery
- Spawn new polecats (Mayor does that) - Spawn new polecats (Mayor does that)