Witness: Add recovery escalation for dormant polecats with unpushed work
- Add gt polecat check-recovery command to check cleanup_status from agent bead
- Update gt polecat nuke to block on polecats with unpushed work (use --force to override)
- Add EscalateRecoveryNeeded function for RECOVERY_NEEDED escalations
- Update Witness template with dormant polecat recovery protocol

Prevents accidental data loss when cleaning up dormant polecats that have unpushed commits. The Witness should now use check-recovery before nuking and escalate NEEDS_RECOVERY cases to the Mayor. (gt-cloml)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
committed by
Steve Yegge
parent
d006b20d7c
commit
aed2482d88
@@ -202,6 +202,8 @@ var (
|
||||
polecatGCDryRun bool
|
||||
polecatNukeAll bool
|
||||
polecatNukeDryRun bool
|
||||
polecatNukeForce bool
|
||||
polecatCheckRecoveryJSON bool
|
||||
)
|
||||
|
||||
var polecatGCCmd = &cobra.Command{
|
||||
@@ -266,6 +268,25 @@ Examples:
|
||||
RunE: runPolecatGitState,
|
||||
}
|
||||
|
||||
// polecatCheckRecoveryCmd defines the `gt polecat check-recovery` subcommand.
// It requires exactly one positional argument of the form <rig>/<polecat> and
// delegates to runPolecatCheckRecovery, which reports either SAFE_TO_NUKE or
// NEEDS_RECOVERY for that polecat.
var polecatCheckRecoveryCmd = &cobra.Command{
|
||||
Use: "check-recovery <rig>/<polecat>",
|
||||
Short: "Check if polecat needs recovery vs safe to nuke",
|
||||
Long: `Check recovery status of a polecat based on cleanup_status in agent bead.
|
||||
|
||||
Used by the Witness to determine appropriate cleanup action:
|
||||
- SAFE_TO_NUKE: cleanup_status is 'clean' - no work at risk
|
||||
- NEEDS_RECOVERY: cleanup_status indicates unpushed/uncommitted work
|
||||
|
||||
This prevents accidental data loss when cleaning up dormant polecats.
|
||||
The Witness should escalate NEEDS_RECOVERY cases to the Mayor.
|
||||
|
||||
Examples:
|
||||
gt polecat check-recovery greenplace/Toast
|
||||
gt polecat check-recovery greenplace/Toast --json`,
|
||||
Args: cobra.ExactArgs(1),
|
||||
RunE: runPolecatCheckRecovery,
|
||||
}
|
||||
|
||||
func init() {
|
||||
// List flags
|
||||
polecatListCmd.Flags().BoolVar(&polecatListJSON, "json", false, "Output as JSON")
|
||||
@@ -291,6 +312,10 @@ func init() {
|
||||
// Nuke flags
|
||||
polecatNukeCmd.Flags().BoolVar(&polecatNukeAll, "all", false, "Nuke all polecats in the rig")
|
||||
polecatNukeCmd.Flags().BoolVar(&polecatNukeDryRun, "dry-run", false, "Show what would be nuked without doing it")
|
||||
polecatNukeCmd.Flags().BoolVarP(&polecatNukeForce, "force", "f", false, "Force nuke even if polecat has unpushed work")
|
||||
|
||||
// Check-recovery flags
|
||||
polecatCheckRecoveryCmd.Flags().BoolVar(&polecatCheckRecoveryJSON, "json", false, "Output as JSON")
|
||||
|
||||
// Add subcommands
|
||||
polecatCmd.AddCommand(polecatListCmd)
|
||||
@@ -303,6 +328,7 @@ func init() {
|
||||
polecatCmd.AddCommand(polecatSyncCmd)
|
||||
polecatCmd.AddCommand(polecatStatusCmd)
|
||||
polecatCmd.AddCommand(polecatGitStateCmd)
|
||||
polecatCmd.AddCommand(polecatCheckRecoveryCmd)
|
||||
polecatCmd.AddCommand(polecatGCCmd)
|
||||
polecatCmd.AddCommand(polecatNukeCmd)
|
||||
|
||||
@@ -1054,6 +1080,122 @@ func getGitState(worktreePath string) (*GitState, error) {
|
||||
return state, nil
|
||||
}
|
||||
|
||||
// RecoveryStatus represents whether a polecat needs recovery or is safe to nuke.
// It is the result produced by runPolecatCheckRecovery and is what gets encoded
// to stdout when check-recovery is run with --json.
|
||||
type RecoveryStatus struct {
|
||||
Rig string `json:"rig"` // rig part of the <rig>/<polecat> address
|
||||
Polecat string `json:"polecat"` // polecat part of the <rig>/<polecat> address
|
||||
CleanupStatus string `json:"cleanup_status"` // value from the agent bead, or derived from git state when the bead is unavailable
|
||||
NeedsRecovery bool `json:"needs_recovery"` // true whenever Verdict is NEEDS_RECOVERY
|
||||
Verdict string `json:"verdict"` // SAFE_TO_NUKE or NEEDS_RECOVERY
|
||||
Branch string `json:"branch,omitempty"` // copied from the polecat info; omitted from JSON when empty
|
||||
Issue string `json:"issue,omitempty"` // copied from the polecat info; omitted from JSON when empty
|
||||
}
|
||||
|
||||
func runPolecatCheckRecovery(cmd *cobra.Command, args []string) error {
|
||||
rigName, polecatName, err := parseAddress(args[0])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
mgr, r, err := getPolecatManager(rigName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Verify polecat exists and get info
|
||||
p, err := mgr.Get(polecatName)
|
||||
if err != nil {
|
||||
return fmt.Errorf("polecat '%s' not found in rig '%s'", polecatName, rigName)
|
||||
}
|
||||
|
||||
// Get cleanup_status from agent bead
|
||||
// We need to read it directly from beads since manager doesn't expose it
|
||||
rigPath := r.Path
|
||||
bd := beads.New(rigPath)
|
||||
agentBeadID := beads.PolecatBeadID(rigName, polecatName)
|
||||
_, fields, err := bd.GetAgentBead(agentBeadID)
|
||||
|
||||
status := RecoveryStatus{
|
||||
Rig: rigName,
|
||||
Polecat: polecatName,
|
||||
Branch: p.Branch,
|
||||
Issue: p.Issue,
|
||||
}
|
||||
|
||||
if err != nil || fields == nil {
|
||||
// No agent bead or no cleanup_status - fall back to git check
|
||||
// This handles polecats that haven't self-reported yet
|
||||
gitState, gitErr := getGitState(p.ClonePath)
|
||||
if gitErr != nil {
|
||||
status.CleanupStatus = "unknown"
|
||||
status.NeedsRecovery = true
|
||||
status.Verdict = "NEEDS_RECOVERY"
|
||||
} else if gitState.Clean {
|
||||
status.CleanupStatus = "clean"
|
||||
status.NeedsRecovery = false
|
||||
status.Verdict = "SAFE_TO_NUKE"
|
||||
} else if gitState.UnpushedCommits > 0 {
|
||||
status.CleanupStatus = "has_unpushed"
|
||||
status.NeedsRecovery = true
|
||||
status.Verdict = "NEEDS_RECOVERY"
|
||||
} else if gitState.StashCount > 0 {
|
||||
status.CleanupStatus = "has_stash"
|
||||
status.NeedsRecovery = true
|
||||
status.Verdict = "NEEDS_RECOVERY"
|
||||
} else {
|
||||
status.CleanupStatus = "has_uncommitted"
|
||||
status.NeedsRecovery = true
|
||||
status.Verdict = "NEEDS_RECOVERY"
|
||||
}
|
||||
} else {
|
||||
// Use cleanup_status from agent bead
|
||||
status.CleanupStatus = fields.CleanupStatus
|
||||
switch fields.CleanupStatus {
|
||||
case "clean":
|
||||
status.NeedsRecovery = false
|
||||
status.Verdict = "SAFE_TO_NUKE"
|
||||
case "has_uncommitted", "has_unpushed", "has_stash":
|
||||
status.NeedsRecovery = true
|
||||
status.Verdict = "NEEDS_RECOVERY"
|
||||
default:
|
||||
// Unknown or empty - be conservative
|
||||
status.NeedsRecovery = true
|
||||
status.Verdict = "NEEDS_RECOVERY"
|
||||
}
|
||||
}
|
||||
|
||||
// JSON output
|
||||
if polecatCheckRecoveryJSON {
|
||||
enc := json.NewEncoder(os.Stdout)
|
||||
enc.SetIndent("", " ")
|
||||
return enc.Encode(status)
|
||||
}
|
||||
|
||||
// Human-readable output
|
||||
fmt.Printf("%s\n\n", style.Bold.Render(fmt.Sprintf("Recovery Status: %s/%s", rigName, polecatName)))
|
||||
fmt.Printf(" Cleanup Status: %s\n", status.CleanupStatus)
|
||||
if status.Branch != "" {
|
||||
fmt.Printf(" Branch: %s\n", status.Branch)
|
||||
}
|
||||
if status.Issue != "" {
|
||||
fmt.Printf(" Issue: %s\n", status.Issue)
|
||||
}
|
||||
fmt.Println()
|
||||
|
||||
if status.NeedsRecovery {
|
||||
fmt.Printf(" Verdict: %s\n", style.Error.Render("NEEDS_RECOVERY"))
|
||||
fmt.Println()
|
||||
fmt.Printf(" %s This polecat has unpushed/uncommitted work.\n", style.Warning.Render("⚠"))
|
||||
fmt.Println(" Escalate to Mayor for recovery before cleanup.")
|
||||
} else {
|
||||
fmt.Printf(" Verdict: %s\n", style.Success.Render("SAFE_TO_NUKE"))
|
||||
fmt.Println()
|
||||
fmt.Printf(" %s Safe to nuke - no work at risk.\n", style.Success.Render("✓"))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func runPolecatGC(cmd *cobra.Command, args []string) error {
|
||||
rigName := args[0]
|
||||
|
||||
@@ -1199,6 +1341,58 @@ func runPolecatNuke(cmd *cobra.Command, args []string) error {
|
||||
}
|
||||
}
|
||||
|
||||
// Check recovery status for each polecat unless --force is set
|
||||
// This prevents accidental data loss when nuking polecats with unpushed work
|
||||
if !polecatNukeForce && !polecatNukeDryRun {
|
||||
var needsRecovery []string
|
||||
for _, p := range toNuke {
|
||||
// Check cleanup_status from agent bead
|
||||
bd := beads.New(p.r.Path)
|
||||
agentBeadID := beads.PolecatBeadID(p.rigName, p.polecatName)
|
||||
_, fields, err := bd.GetAgentBead(agentBeadID)
|
||||
|
||||
var recoveryNeeded bool
|
||||
if err != nil || fields == nil {
|
||||
// No agent bead - fall back to git check
|
||||
polecatInfo, infoErr := p.mgr.Get(p.polecatName)
|
||||
if infoErr == nil && polecatInfo != nil {
|
||||
gitState, gitErr := getGitState(polecatInfo.ClonePath)
|
||||
if gitErr != nil || !gitState.Clean {
|
||||
recoveryNeeded = true
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Check cleanup_status from agent bead
|
||||
switch fields.CleanupStatus {
|
||||
case "clean":
|
||||
recoveryNeeded = false
|
||||
case "has_uncommitted", "has_unpushed", "has_stash", "unknown", "":
|
||||
recoveryNeeded = true
|
||||
default:
|
||||
recoveryNeeded = true
|
||||
}
|
||||
}
|
||||
|
||||
if recoveryNeeded {
|
||||
needsRecovery = append(needsRecovery, fmt.Sprintf("%s/%s", p.rigName, p.polecatName))
|
||||
}
|
||||
}
|
||||
|
||||
if len(needsRecovery) > 0 {
|
||||
fmt.Printf("%s The following polecats have unpushed/uncommitted work:\n", style.Error.Render("Error:"))
|
||||
for _, pc := range needsRecovery {
|
||||
fmt.Printf(" - %s\n", pc)
|
||||
}
|
||||
fmt.Println()
|
||||
fmt.Println("These polecats NEED RECOVERY before cleanup.")
|
||||
fmt.Println("Options:")
|
||||
fmt.Printf(" 1. Escalate to Mayor: gt mail send mayor/ -s \"RECOVERY_NEEDED\" -m \"...\"\n")
|
||||
fmt.Printf(" 2. Force nuke (LOSES WORK): gt polecat nuke --force %s\n", strings.Join(needsRecovery, " "))
|
||||
fmt.Println()
|
||||
return fmt.Errorf("blocked: %d polecat(s) need recovery", len(needsRecovery))
|
||||
}
|
||||
}
|
||||
|
||||
// Nuke each polecat
|
||||
t := tmux.NewTmux()
|
||||
var nukeErrors []string
|
||||
@@ -1214,7 +1408,11 @@ func runPolecatNuke(cmd *cobra.Command, args []string) error {
|
||||
continue
|
||||
}
|
||||
|
||||
if polecatNukeForce {
|
||||
fmt.Printf("%s Nuking %s/%s (--force)...\n", style.Warning.Render("⚠"), p.rigName, p.polecatName)
|
||||
} else {
|
||||
fmt.Printf("Nuking %s/%s...\n", p.rigName, p.polecatName)
|
||||
}
|
||||
|
||||
// Step 1: Kill session (force mode - no graceful shutdown)
|
||||
sessMgr := session.NewManager(t, p.r)
|
||||
|
||||
@@ -348,6 +348,55 @@ Requested at: %s`,
|
||||
return msg.ID, nil
|
||||
}
|
||||
|
||||
// RecoveryPayload contains data for RECOVERY_NEEDED escalation.
// EscalateRecoveryNeeded renders these fields into the mail subject and body.
|
||||
type RecoveryPayload struct {
|
||||
PolecatName string // polecat name; combined with the rig name in the subject and body
|
||||
Rig string // NOTE(review): not read by EscalateRecoveryNeeded, which uses its rigName argument instead
|
||||
CleanupStatus string // rendered on the "Cleanup Status:" line of the body
|
||||
Branch string // rendered on the "Branch:" line of the body
|
||||
IssueID string // rendered on the "Issue:" line of the body
|
||||
DetectedAt time.Time // rendered on the "Detected:" line, formatted as RFC 3339
|
||||
}
|
||||
|
||||
// EscalateRecoveryNeeded sends a RECOVERY_NEEDED escalation to the Mayor.
|
||||
// This is used when a dormant polecat has unpushed work that needs recovery
|
||||
// before cleanup. The Mayor should coordinate recovery (e.g., push the branch,
|
||||
// save the work) before authorizing cleanup.
|
||||
func EscalateRecoveryNeeded(router *mail.Router, rigName string, payload *RecoveryPayload) (string, error) {
|
||||
msg := &mail.Message{
|
||||
From: fmt.Sprintf("%s/witness", rigName),
|
||||
To: "mayor/",
|
||||
Subject: fmt.Sprintf("RECOVERY_NEEDED %s/%s", rigName, payload.PolecatName),
|
||||
Priority: mail.PriorityUrgent,
|
||||
Body: fmt.Sprintf(`Polecat: %s/%s
|
||||
Cleanup Status: %s
|
||||
Branch: %s
|
||||
Issue: %s
|
||||
Detected: %s
|
||||
|
||||
This polecat has unpushed/uncommitted work that will be lost if nuked.
|
||||
Please coordinate recovery before authorizing cleanup:
|
||||
1. Check if branch can be pushed to origin
|
||||
2. Review uncommitted changes for value
|
||||
3. Either recover the work or authorize force-nuke
|
||||
|
||||
DO NOT nuke without --force after recovery.`,
|
||||
rigName,
|
||||
payload.PolecatName,
|
||||
payload.CleanupStatus,
|
||||
payload.Branch,
|
||||
payload.IssueID,
|
||||
payload.DetectedAt.Format(time.RFC3339),
|
||||
),
|
||||
}
|
||||
|
||||
if err := router.Send(msg); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return msg.ID, nil
|
||||
}
|
||||
|
||||
// UpdateCleanupWispState updates a cleanup wisp's state label.
|
||||
func UpdateCleanupWispState(workDir, wispID, newState string) error {
|
||||
// Get current labels to preserve other labels
|
||||
|
||||
@@ -30,28 +30,73 @@ Check your mail with: `gt mail inbox`
|
||||
|
||||
---
|
||||
|
||||
## Dormant Polecat Recovery Protocol
|
||||
|
||||
When checking dormant polecats, use the recovery check command:
|
||||
|
||||
```bash
|
||||
gt polecat check-recovery {{RIG}}/<name>
|
||||
```
|
||||
|
||||
This returns one of:
|
||||
- **SAFE_TO_NUKE**: cleanup_status is 'clean' - proceed with normal cleanup
|
||||
- **NEEDS_RECOVERY**: cleanup_status indicates unpushed/uncommitted work
|
||||
|
||||
### If NEEDS_RECOVERY
|
||||
|
||||
**CRITICAL: Do NOT auto-nuke polecats with unpushed work.**
|
||||
|
||||
Instead, escalate to Mayor:
|
||||
```bash
|
||||
gt mail send mayor/ -s "RECOVERY_NEEDED {{RIG}}/<polecat>" -m "Cleanup Status: has_unpushed
|
||||
Branch: <branch-name>
|
||||
Issue: <issue-id>
|
||||
Detected: $(date -Iseconds)
|
||||
|
||||
This polecat has unpushed work that will be lost if nuked.
|
||||
Please coordinate recovery before authorizing cleanup."
|
||||
```
|
||||
|
||||
The nuke command will block automatically:
|
||||
```bash
|
||||
$ gt polecat nuke {{RIG}}/<name>
|
||||
Error: The following polecats have unpushed/uncommitted work:
|
||||
- {{RIG}}/<name>
|
||||
|
||||
These polecats NEED RECOVERY before cleanup.
|
||||
Options:
|
||||
1. Escalate to Mayor: gt mail send mayor/ -s "RECOVERY_NEEDED" -m "..."
|
||||
2. Force nuke (LOSES WORK): gt polecat nuke --force {{RIG}}/<name>
|
||||
```
|
||||
|
||||
Only use `--force` after Mayor authorizes or confirms work is unrecoverable.
|
||||
|
||||
---
|
||||
|
||||
## Pre-Kill Verification Checklist
|
||||
|
||||
Before killing ANY polecat session, verify:
|
||||
|
||||
```
|
||||
[ ] 1. gt polecat git-state <name> # Must be clean
|
||||
[ ] 2. Check for uncommitted work:
|
||||
cd polecats/<name> && git status
|
||||
[ ] 3. Check for unpushed commits:
|
||||
git log origin/main..HEAD
|
||||
[ ] 4. Verify issue closed:
|
||||
[ ] 1. gt polecat check-recovery {{RIG}}/<name> # Must be SAFE_TO_NUKE
|
||||
[ ] 2. gt polecat git-state <name> # Must be clean
|
||||
[ ] 3. Verify issue closed:
|
||||
bd show <issue-id> # Should show 'closed'
|
||||
[ ] 5. Verify PR submitted (if applicable):
|
||||
[ ] 4. Verify PR submitted (if applicable):
|
||||
Check merge queue or PR status
|
||||
```
|
||||
|
||||
**If git state is dirty:**
|
||||
**If NEEDS_RECOVERY:**
|
||||
1. Send RECOVERY_NEEDED escalation to Mayor (see above)
|
||||
2. Wait for Mayor authorization
|
||||
3. Do NOT proceed with nuke
|
||||
|
||||
**If git state dirty but polecat still alive:**
|
||||
1. Nudge the worker to clean up
|
||||
2. Wait 5 minutes for response
|
||||
3. If still dirty after 3 attempts → Escalate to Mayor
|
||||
|
||||
**If all checks pass:**
|
||||
**If SAFE_TO_NUKE and all checks pass:**
|
||||
1. **Send MERGE_READY to refinery** (CRITICAL - do this BEFORE killing):
|
||||
```bash
|
||||
gt mail send {{RIG}}/refinery -s "MERGE_READY <polecat>" -m "Branch: <branch>
|
||||
@@ -65,6 +110,7 @@ Before killing ANY polecat session, verify:
|
||||
```
|
||||
NOTE: Use `gt polecat nuke` instead of raw git commands. It knows the correct
|
||||
worktree parent repo (mayor/rig or .repo.git) and handles cleanup properly.
|
||||
The nuke will automatically block if cleanup_status indicates unpushed work.
|
||||
3. **Notify Mayor** (for tracking):
|
||||
```bash
|
||||
gt mail send mayor/ -s "Polecat <name> processed" -m "Work: <issue>
|
||||
@@ -78,7 +124,10 @@ Before killing ANY polecat session, verify:
|
||||
```bash
|
||||
# Polecat management
|
||||
gt polecat list {{RIG}} # See all polecats
|
||||
gt polecat git-state <name> # Check git cleanliness
|
||||
gt polecat check-recovery {{RIG}}/<name> # Check if safe to nuke
|
||||
gt polecat git-state {{RIG}}/<name> # Check git cleanliness
|
||||
gt polecat nuke {{RIG}}/<name> # Nuke (blocks on unpushed work)
|
||||
gt polecat nuke --force {{RIG}}/<name> # Force nuke (LOSES WORK)
|
||||
|
||||
# Session inspection
|
||||
tmux capture-pane -t gt-{{RIG}}-<name> -p | tail -40
|
||||
@@ -91,12 +140,15 @@ gt mail inbox
|
||||
gt mail read <id>
|
||||
gt mail send mayor/ -s "Subject" -m "Message"
|
||||
gt mail send {{RIG}}/refinery -s "MERGE_READY <polecat>" -m "..."
|
||||
gt mail send mayor/ -s "RECOVERY_NEEDED {{RIG}}/<polecat>" -m "..." # Escalate
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Do NOT
|
||||
|
||||
- **Nuke polecats with unpushed work** - always check-recovery first
|
||||
- Use `--force` without Mayor authorization
|
||||
- Kill sessions without completing pre-kill verification
|
||||
- Kill sessions without sending MERGE_READY to refinery
|
||||
- Spawn new polecats (Mayor does that)
|
||||
|
||||
Reference in New Issue
Block a user