Witness: Add recovery escalation for dormant polecats with unpushed work

- Add gt polecat check-recovery command to check cleanup_status from agent bead
- Update gt polecat nuke to block on polecats with unpushed work (use --force to override)
- Add EscalateRecoveryNeeded function for RECOVERY_NEEDED escalations
- Update Witness template with dormant polecat recovery protocol

Prevents accidental data loss when cleaning up dormant polecats that have unpushed commits. The Witness should now use check-recovery before nuking and escalate NEEDS_RECOVERY cases to the Mayor.

(gt-cloml)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-30 20:59:46 -08:00
parent d006b20d7c
commit aed2482d88
3 changed files with 314 additions and 15 deletions
@@ -202,6 +202,8 @@ var (
 	polecatGCDryRun           bool
 	polecatNukeAll            bool
 	polecatNukeDryRun         bool
+	polecatNukeForce          bool
+	polecatCheckRecoveryJSON  bool
 )

 var polecatGCCmd = &cobra.Command{
@@ -266,6 +268,25 @@ Examples:
 	RunE: runPolecatGitState,
 }

+// polecatCheckRecoveryCmd defines "gt polecat check-recovery <rig>/<polecat>".
+// It requires exactly one positional argument (the polecat address) and
+// delegates to runPolecatCheckRecovery, which reports a SAFE_TO_NUKE or
+// NEEDS_RECOVERY verdict. The --json flag is registered in init.
+var polecatCheckRecoveryCmd = &cobra.Command{
+	Use:   "check-recovery <rig>/<polecat>",
+	Short: "Check if polecat needs recovery vs safe to nuke",
+	Long: `Check recovery status of a polecat based on cleanup_status in agent bead.
+
+Used by the Witness to determine appropriate cleanup action:
+  - SAFE_TO_NUKE: cleanup_status is 'clean' - no work at risk
+  - NEEDS_RECOVERY: cleanup_status indicates unpushed/uncommitted work
+
+This prevents accidental data loss when cleaning up dormant polecats.
+The Witness should escalate NEEDS_RECOVERY cases to the Mayor.
+
+Examples:
+  gt polecat check-recovery greenplace/Toast
+  gt polecat check-recovery greenplace/Toast --json`,
+	Args: cobra.ExactArgs(1),
+	RunE: runPolecatCheckRecovery,
+}
+
 func init() {
 	// List flags
 	polecatListCmd.Flags().BoolVar(&polecatListJSON, "json", false, "Output as JSON")
@@ -291,6 +312,10 @@ func init() {
 	// Nuke flags
 	polecatNukeCmd.Flags().BoolVar(&polecatNukeAll, "all", false, "Nuke all polecats in the rig")
 	polecatNukeCmd.Flags().BoolVar(&polecatNukeDryRun, "dry-run", false, "Show what would be nuked without doing it")
+	polecatNukeCmd.Flags().BoolVarP(&polecatNukeForce, "force", "f", false, "Force nuke even if polecat has unpushed work")
+
+	// Check-recovery flags
+	polecatCheckRecoveryCmd.Flags().BoolVar(&polecatCheckRecoveryJSON, "json", false, "Output as JSON")

 	// Add subcommands
 	polecatCmd.AddCommand(polecatListCmd)
@@ -303,6 +328,7 @@ func init() {
 	polecatCmd.AddCommand(polecatSyncCmd)
 	polecatCmd.AddCommand(polecatStatusCmd)
 	polecatCmd.AddCommand(polecatGitStateCmd)
+	polecatCmd.AddCommand(polecatCheckRecoveryCmd)
 	polecatCmd.AddCommand(polecatGCCmd)
 	polecatCmd.AddCommand(polecatNukeCmd)

@@ -1054,6 +1080,122 @@ func getGitState(worktreePath string) (*GitState, error) {
 	return state, nil
 }

+// RecoveryStatus represents whether a polecat needs recovery or is safe to nuke.
+// It is the result produced by "gt polecat check-recovery" and is emitted
+// verbatim as JSON when --json is set.
+//
+// Rig and Polecat identify the polecat that was checked. CleanupStatus is the
+// cleanup_status read from the polecat's agent bead or, when the bead cannot
+// be read, a value derived from the worktree's git state ("clean",
+// "has_unpushed", "has_stash", "has_uncommitted", or "unknown").
+// NeedsRecovery and Verdict carry the same decision in boolean and string
+// form. Branch and Issue are copied from the polecat's info and are omitted
+// from JSON when empty.
+type RecoveryStatus struct {
+	Rig           string `json:"rig"`
+	Polecat       string `json:"polecat"`
+	CleanupStatus string `json:"cleanup_status"`
+	NeedsRecovery bool   `json:"needs_recovery"`
+	Verdict       string `json:"verdict"` // SAFE_TO_NUKE or NEEDS_RECOVERY
+	Branch        string `json:"branch,omitempty"`
+	Issue         string `json:"issue,omitempty"`
+}
+
+// cleanupStatusFromGit derives a cleanup status from a worktree's git state.
+// It is the fallback used when a polecat's agent bead cannot be read.
+//
+// It returns "unknown" if the git state cannot be determined, "clean" if the
+// worktree is clean, and otherwise the first of "has_unpushed", "has_stash",
+// or "has_uncommitted" that applies, in that order.
+func cleanupStatusFromGit(worktreePath string) string {
+	gitState, err := getGitState(worktreePath)
+	switch {
+	case err != nil:
+		return "unknown"
+	case gitState.Clean:
+		return "clean"
+	case gitState.UnpushedCommits > 0:
+		return "has_unpushed"
+	case gitState.StashCount > 0:
+		return "has_stash"
+	default:
+		return "has_uncommitted"
+	}
+}
+
+// runPolecatCheckRecovery implements "gt polecat check-recovery <rig>/<polecat>".
+//
+// It reports whether the polecat is SAFE_TO_NUKE or NEEDS_RECOVERY. The
+// cleanup_status recorded in the polecat's agent bead is used when the bead
+// can be read; otherwise the status is derived from the worktree's git state.
+// Only a status of "clean" is treated as safe: every other value, including
+// an empty or unrecognized one, is conservatively reported as NEEDS_RECOVERY.
+//
+// With --json the RecoveryStatus is written to stdout as indented JSON;
+// otherwise a human-readable summary is printed.
+//
+// It returns an error if the address cannot be parsed, the rig's polecat
+// manager cannot be obtained, the polecat lookup fails, or JSON encoding fails.
+func runPolecatCheckRecovery(cmd *cobra.Command, args []string) error {
+	rigName, polecatName, err := parseAddress(args[0])
+	if err != nil {
+		return err
+	}
+
+	mgr, r, err := getPolecatManager(rigName)
+	if err != nil {
+		return err
+	}
+
+	// Verify polecat exists and get info. Keep the underlying error in the
+	// chain so failures other than "not found" are not misreported.
+	p, err := mgr.Get(polecatName)
+	if err != nil {
+		return fmt.Errorf("polecat '%s' not found in rig '%s': %w", polecatName, rigName, err)
+	}
+
+	status := RecoveryStatus{
+		Rig:     rigName,
+		Polecat: polecatName,
+		Branch:  p.Branch,
+		Issue:   p.Issue,
+	}
+
+	// Get cleanup_status from agent bead.
+	// We need to read it directly from beads since manager doesn't expose it.
+	bd := beads.New(r.Path)
+	_, fields, beadErr := bd.GetAgentBead(beads.PolecatBeadID(rigName, polecatName))
+	if beadErr != nil || fields == nil {
+		// Agent bead unreadable - fall back to inspecting the worktree.
+		status.CleanupStatus = cleanupStatusFromGit(p.ClonePath)
+	} else {
+		// A bead that exists but has not reported a status yet yields "",
+		// which is deliberately treated as NEEDS_RECOVERY below (the same
+		// conservative rule "gt polecat nuke" applies).
+		status.CleanupStatus = fields.CleanupStatus
+	}
+
+	// Only an explicit "clean" is safe; anything else puts work at risk.
+	status.NeedsRecovery = status.CleanupStatus != "clean"
+	if status.NeedsRecovery {
+		status.Verdict = "NEEDS_RECOVERY"
+	} else {
+		status.Verdict = "SAFE_TO_NUKE"
+	}
+
+	// JSON output
+	if polecatCheckRecoveryJSON {
+		enc := json.NewEncoder(os.Stdout)
+		enc.SetIndent("", "  ")
+		return enc.Encode(status)
+	}
+
+	// Human-readable output
+	shownStatus := status.CleanupStatus
+	if shownStatus == "" {
+		// Avoid printing a blank field when the bead has no status yet.
+		shownStatus = "(not reported)"
+	}
+	fmt.Printf("%s\n\n", style.Bold.Render(fmt.Sprintf("Recovery Status: %s/%s", rigName, polecatName)))
+	fmt.Printf("  Cleanup Status:  %s\n", shownStatus)
+	if status.Branch != "" {
+		fmt.Printf("  Branch:          %s\n", status.Branch)
+	}
+	if status.Issue != "" {
+		fmt.Printf("  Issue:           %s\n", status.Issue)
+	}
+	fmt.Println()
+
+	if !status.NeedsRecovery {
+		fmt.Printf("  Verdict:         %s\n", style.Success.Render(status.Verdict))
+		fmt.Println()
+		fmt.Printf("  %s Safe to nuke - no work at risk.\n", style.Success.Render("✓"))
+		return nil
+	}
+
+	fmt.Printf("  Verdict:         %s\n", style.Error.Render(status.Verdict))
+	fmt.Println()
+	switch status.CleanupStatus {
+	case "has_uncommitted", "has_unpushed", "has_stash":
+		fmt.Printf("  %s This polecat has unpushed/uncommitted work.\n", style.Warning.Render("⚠"))
+	default:
+		// Unknown, empty, or unrecognized status: we could not prove the
+		// worktree is clean, so do not claim that work definitely exists.
+		fmt.Printf("  %s Cleanup status could not be confirmed; work may be at risk.\n", style.Warning.Render("⚠"))
+	}
+	fmt.Println("  Escalate to Mayor for recovery before cleanup.")
+
+	return nil
+}
+
 func runPolecatGC(cmd *cobra.Command, args []string) error {
 	rigName := args[0]

@@ -1199,6 +1341,58 @@ func runPolecatNuke(cmd *cobra.Command, args []string) error {
 		}
 	}

+	// Check recovery status for each polecat unless --force is set
+	// This prevents accidental data loss when nuking polecats with unpushed work
+	if !polecatNukeForce && !polecatNukeDryRun {
+		var needsRecovery []string
+		for _, p := range toNuke {
+			// Check cleanup_status from agent bead
+			bd := beads.New(p.r.Path)
+			agentBeadID := beads.PolecatBeadID(p.rigName, p.polecatName)
+			_, fields, err := bd.GetAgentBead(agentBeadID)
+
+			var recoveryNeeded bool
+			if err != nil || fields == nil {
+				// No agent bead - fall back to git check
+				polecatInfo, infoErr := p.mgr.Get(p.polecatName)
+				if infoErr == nil && polecatInfo != nil {
+					gitState, gitErr := getGitState(polecatInfo.ClonePath)
+					if gitErr != nil || !gitState.Clean {
+						recoveryNeeded = true
+					}
+				}
+			} else {
+				// Check cleanup_status from agent bead
+				switch fields.CleanupStatus {
+				case "clean":
+					recoveryNeeded = false
+				case "has_uncommitted", "has_unpushed", "has_stash", "unknown", "":
+					recoveryNeeded = true
+				default:
+					recoveryNeeded = true
+				}
+			}
+
+			if recoveryNeeded {
+				needsRecovery = append(needsRecovery, fmt.Sprintf("%s/%s", p.rigName, p.polecatName))
+			}
+		}
+
+		if len(needsRecovery) > 0 {
+			fmt.Printf("%s The following polecats have unpushed/uncommitted work:\n", style.Error.Render("Error:"))
+			for _, pc := range needsRecovery {
+				fmt.Printf("  - %s\n", pc)
+			}
+			fmt.Println()
+			fmt.Println("These polecats NEED RECOVERY before cleanup.")
+			fmt.Println("Options:")
+			fmt.Printf("  1. Escalate to Mayor: gt mail send mayor/ -s \"RECOVERY_NEEDED\" -m \"...\"\n")
+			fmt.Printf("  2. Force nuke (LOSES WORK): gt polecat nuke --force %s\n", strings.Join(needsRecovery, " "))
+			fmt.Println()
+			return fmt.Errorf("blocked: %d polecat(s) need recovery", len(needsRecovery))
+		}
+	}
+
 	// Nuke each polecat
 	t := tmux.NewTmux()
 	var nukeErrors []string
@@ -1214,7 +1408,11 @@ func runPolecatNuke(cmd *cobra.Command, args []string) error {
 			continue
 		}

+		if polecatNukeForce {
+			fmt.Printf("%s Nuking %s/%s (--force)...\n", style.Warning.Render("⚠"), p.rigName, p.polecatName)
+		} else {
 			fmt.Printf("Nuking %s/%s...\n", p.rigName, p.polecatName)
+		}

 		// Step 1: Kill session (force mode - no graceful shutdown)
 		sessMgr := session.NewManager(t, p.r)
@@ -348,6 +348,55 @@ Requested at: %s`,
 	return msg.ID, nil
 }

+// RecoveryPayload contains data for RECOVERY_NEEDED escalation.
+//
+// PolecatName and Rig identify the polecat, CleanupStatus is the status that
+// triggered the escalation, Branch and IssueID describe the work at risk, and
+// DetectedAt records when the condition was detected. EscalateRecoveryNeeded
+// renders all fields except Rig into the message (it takes the rig name as a
+// separate argument) and formats DetectedAt as RFC 3339.
+type RecoveryPayload struct {
+	PolecatName   string
+	Rig           string
+	CleanupStatus string
+	Branch        string
+	IssueID       string
+	DetectedAt    time.Time
+}
+
+// EscalateRecoveryNeeded sends a RECOVERY_NEEDED escalation to the Mayor.
+// This is used when a dormant polecat has unpushed work that needs recovery
+// before cleanup. The Mayor should coordinate recovery (e.g., push the branch,
+// save the work) before authorizing cleanup.
+//
+// The message is sent from "<rigName>/witness" to "mayor/" with urgent
+// priority. rigName is used for both the sender and the polecat address;
+// payload.Rig is not consulted.
+//
+// It returns the ID of the sent message, or an error if payload is nil or the
+// router fails to send.
+func EscalateRecoveryNeeded(router *mail.Router, rigName string, payload *RecoveryPayload) (string, error) {
+	// Guard against a nil payload instead of panicking on the first field access.
+	if payload == nil {
+		return "", fmt.Errorf("recovery escalation for rig %q: payload is nil", rigName)
+	}
+
+	msg := &mail.Message{
+		From:     fmt.Sprintf("%s/witness", rigName),
+		To:       "mayor/",
+		Subject:  fmt.Sprintf("RECOVERY_NEEDED %s/%s", rigName, payload.PolecatName),
+		Priority: mail.PriorityUrgent,
+		Body: fmt.Sprintf(`Polecat: %s/%s
+Cleanup Status: %s
+Branch: %s
+Issue: %s
+Detected: %s
+
+This polecat has unpushed/uncommitted work that will be lost if nuked.
+Please coordinate recovery before authorizing cleanup:
+1. Check if branch can be pushed to origin
+2. Review uncommitted changes for value
+3. Either recover the work or authorize force-nuke
+
+DO NOT nuke with --force until recovery is complete or the work is confirmed unrecoverable.`,
+			rigName,
+			payload.PolecatName,
+			payload.CleanupStatus,
+			payload.Branch,
+			payload.IssueID,
+			payload.DetectedAt.Format(time.RFC3339),
+		),
+	}
+
+	if err := router.Send(msg); err != nil {
+		return "", err
+	}
+
+	return msg.ID, nil
+}
+
 // UpdateCleanupWispState updates a cleanup wisp's state label.
 func UpdateCleanupWispState(workDir, wispID, newState string) error {
 	// Get current labels to preserve other labels
@@ -30,28 +30,73 @@ Check your mail with: `gt mail inbox`

 ---

+## Dormant Polecat Recovery Protocol
+
+When checking dormant polecats, use the recovery check command:
+
+```bash
+gt polecat check-recovery {{RIG}}/<name>
+```
+
+This returns one of:
+- **SAFE_TO_NUKE**: cleanup_status is 'clean' - proceed with normal cleanup
+- **NEEDS_RECOVERY**: cleanup_status indicates unpushed/uncommitted work
+
+### If NEEDS_RECOVERY
+
+**CRITICAL: Do NOT auto-nuke polecats with unpushed work.**
+
+Instead, escalate to Mayor:
+```bash
+gt mail send mayor/ -s "RECOVERY_NEEDED {{RIG}}/<polecat>" -m "Cleanup Status: has_unpushed
+Branch: <branch-name>
+Issue: <issue-id>
+Detected: $(date -Iseconds)
+
+This polecat has unpushed work that will be lost if nuked.
+Please coordinate recovery before authorizing cleanup."
+```
+
+The nuke command will block automatically:
+```bash
+$ gt polecat nuke {{RIG}}/<name>
+Error: The following polecats have unpushed/uncommitted work:
+  - {{RIG}}/<name>
+
+These polecats NEED RECOVERY before cleanup.
+Options:
+  1. Escalate to Mayor: gt mail send mayor/ -s "RECOVERY_NEEDED" -m "..."
+  2. Force nuke (LOSES WORK): gt polecat nuke --force {{RIG}}/<name>
+```
+
+Only use `--force` after Mayor authorizes or confirms work is unrecoverable.
+
+---
+
 ## Pre-Kill Verification Checklist

 Before killing ANY polecat session, verify:

 ```
-[ ] 1. gt polecat git-state <name>    # Must be clean
-[ ] 2. Check for uncommitted work:
-       cd polecats/<name> && git status
-[ ] 3. Check for unpushed commits:
-       git log origin/main..HEAD
-[ ] 4. Verify issue closed:
+[ ] 1. gt polecat check-recovery {{RIG}}/<name>  # Must be SAFE_TO_NUKE
+[ ] 2. gt polecat git-state {{RIG}}/<name>       # Must be clean
+[ ] 3. Verify issue closed:
       bd show <issue-id>  # Should show 'closed'
-[ ] 5. Verify PR submitted (if applicable):
+[ ] 4. Verify PR submitted (if applicable):
       Check merge queue or PR status
 ```

-**If git state is dirty:**
+**If NEEDS_RECOVERY:**
+1. Send RECOVERY_NEEDED escalation to Mayor (see above)
+2. Wait for Mayor authorization
+3. Do NOT proceed with nuke
+
+**If git state is dirty but the polecat is still alive:**
 1. Nudge the worker to clean up
 2. Wait 5 minutes for response
 3. If still dirty after 3 attempts → Escalate to Mayor

-**If all checks pass:**
+**If SAFE_TO_NUKE and all checks pass:**
 1. **Send MERGE_READY to refinery** (CRITICAL - do this BEFORE killing):
   ```bash
   gt mail send {{RIG}}/refinery -s "MERGE_READY <polecat>" -m "Branch: <branch>
@@ -65,6 +110,7 @@ Before killing ANY polecat session, verify:
   ```
   NOTE: Use `gt polecat nuke` instead of raw git commands. It knows the correct
   worktree parent repo (mayor/rig or .repo.git) and handles cleanup properly.
+   The nuke will automatically block if cleanup_status indicates unpushed work.
 3. **Notify Mayor** (for tracking):
   ```bash
   gt mail send mayor/ -s "Polecat <name> processed" -m "Work: <issue>
@@ -78,7 +124,10 @@ Before killing ANY polecat session, verify:
 ```bash
 # Polecat management
 gt polecat list {{RIG}}                # See all polecats
-gt polecat git-state <name>       # Check git cleanliness
+gt polecat check-recovery {{RIG}}/<name>  # Check if safe to nuke
+gt polecat git-state {{RIG}}/<name>    # Check git cleanliness
+gt polecat nuke {{RIG}}/<name>         # Nuke (blocks on unpushed work)
+gt polecat nuke --force {{RIG}}/<name> # Force nuke (LOSES WORK)

 # Session inspection
 tmux capture-pane -t gt-{{RIG}}-<name> -p | tail -40
@@ -91,12 +140,15 @@ gt mail inbox
 gt mail read <id>
 gt mail send mayor/ -s "Subject" -m "Message"
 gt mail send {{RIG}}/refinery -s "MERGE_READY <polecat>" -m "..."
+gt mail send mayor/ -s "RECOVERY_NEEDED {{RIG}}/<polecat>" -m "..."  # Escalate
 ```

 ---

 ## Do NOT

+- **Nuke polecats with unpushed work** - always run `gt polecat check-recovery` first
+- Use `--force` without Mayor authorization
 - Kill sessions without completing pre-kill verification
 - Kill sessions without sending MERGE_READY to refinery
 - Spawn new polecats (Mayor does that)