Witness: Verify POLECAT_DONE before stopping sessions (gt-ldk8)

- Add handling for POLECAT_DONE messages in processShutdownRequests() - Track which polecats have signaled done (using SpawnedIssues with "done:" prefix) - For LIFECYCLE:shutdown requests, wait for POLECAT_DONE before cleanup - Add checkPendingCompletions() to nudge polecats with closed issues - Add 10-minute timeout with force-kill after waiting for POLECAT_DONE - Protects against losing MR submissions when Witness cleans up too early 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-23 00:17:35 -08:00
parent a3c671188f
commit 33cdd623bc
1 changed files with 256 additions and 1 deletions
--- a/internal/witness/manager.go
+++ b/internal/witness/manager.go
@@ -361,6 +361,11 @@ func (m *Manager) checkAndProcess(w *Witness) {
 		fmt.Printf("Shutdown request error: %v\n", err)
 	}
 	// Check for polecats with closed issues that haven't signaled done
 	if err := m.checkPendingCompletions(w); err != nil {
 		fmt.Printf("Pending completions check error: %v\n", err)
 	}
 	// Auto-spawn for ready work (if enabled)
 	if w.Config.AutoSpawn {
 		if err := m.autoSpawnForReadyWork(w); err != nil {
@@ -639,7 +644,48 @@ func (m *Manager) processShutdownRequests(w *Witness) error {
 	}
 	for _, msg := range messages {
-		// Look for LIFECYCLE requests
+		// Handle POLECAT_DONE messages (polecat has completed work and is ready for cleanup)
 		if strings.HasPrefix(msg.Subject, "POLECAT_DONE ") {
 			polecatName := extractPolecatNameFromDone(msg.Subject)
 			if polecatName == "" {
 				fmt.Printf("Warning: could not extract polecat name from POLECAT_DONE message\n")
 				m.ackMessage(msg.ID)
 				continue
 			}
 			fmt.Printf("Processing POLECAT_DONE from %s\n", polecatName)
 			// Record that this polecat has signaled done
 			m.recordDone(w, polecatName)
 			// Verify polecat state before cleanup
 			if err := m.verifyPolecatState(polecatName); err != nil {
 				fmt.Printf("  Verification failed: %v\n", err)
 				// Send nudge to polecat to fix state
 				if err := m.sendNudge(polecatName, err.Error()); err != nil {
 					fmt.Printf("  Warning: failed to send nudge: %v\n", err)
 				}
 				// Don't ack message - will retry on next check
 				continue
 			}
 			// Perform cleanup
 			if err := m.cleanupPolecat(polecatName); err != nil {
 				fmt.Printf("  Cleanup error: %v\n", err)
 				// Don't ack message on error - will retry
 				continue
 			}
 			fmt.Printf("  Cleanup complete\n")
 			// Acknowledge the message
 			m.ackMessage(msg.ID)
 			continue
 		}
 		// Handle LIFECYCLE shutdown requests (legacy/Deacon-managed)
 		if strings.Contains(msg.Subject, "LIFECYCLE:") && strings.Contains(msg.Subject, "shutdown") {
 			fmt.Printf("Processing shutdown request: %s\n", msg.Subject)
@@ -653,6 +699,19 @@ func (m *Manager) processShutdownRequests(w *Witness) error {
 			fmt.Printf("  Polecat: %s\n", polecatName)
 			// SAFETY: Only cleanup if polecat has sent POLECAT_DONE
 			if !m.hasSentDone(w, polecatName) {
 				fmt.Printf("  Waiting for POLECAT_DONE from %s before cleanup\n", polecatName)
 				// Send reminder to polecat to complete shutdown sequence
 				if err := m.sendNudge(polecatName, "Please run 'gt done' to signal completion"); err != nil {
 					fmt.Printf("  Warning: failed to send nudge: %v\n", err)
 				}
 				// Don't ack message - will retry on next check
 				continue
 			}
 			// Verify polecat state before cleanup
 			if err := m.verifyPolecatState(polecatName); err != nil {
 				fmt.Printf("  Verification failed: %v\n", err)
@@ -801,6 +860,202 @@ func extractPolecatName(body string) string {
 	return ""
 }
 // extractPolecatNameFromDone extracts the polecat name from a POLECAT_DONE subject.
 // Subject format: "POLECAT_DONE {name}"
 func extractPolecatNameFromDone(subject string) string {
 	const prefix = "POLECAT_DONE "
 	if strings.HasPrefix(subject, prefix) {
 		return strings.TrimSpace(subject[len(prefix):])
 	}
 	return ""
 }
 // recordDone records that a polecat has sent POLECAT_DONE.
 // Uses SpawnedIssues with "done:" prefix to track.
 func (m *Manager) recordDone(w *Witness, polecatName string) {
 	doneKey := "done:" + polecatName
 	// Don't record duplicates
 	for _, entry := range w.SpawnedIssues {
 		if entry == doneKey {
 			return
 		}
 	}
 	w.SpawnedIssues = append(w.SpawnedIssues, doneKey)
 	_ = m.saveState(w)
 }
 // hasSentDone checks if a polecat has sent POLECAT_DONE.
 func (m *Manager) hasSentDone(w *Witness, polecatName string) bool {
 	doneKey := "done:" + polecatName
 	for _, entry := range w.SpawnedIssues {
 		if entry == doneKey {
 			return true
 		}
 	}
 	return false
 }
 // PendingCompletionTimeout is how long to wait for POLECAT_DONE after issue is closed
 // before force-killing the polecat session.
 const PendingCompletionTimeout = 10 * time.Minute
 // checkPendingCompletions checks for polecats with closed issues that haven't sent POLECAT_DONE.
 // It nudges them to complete, and force-kills after timeout.
 func (m *Manager) checkPendingCompletions(w *Witness) error {
 	polecatMgr := polecat.NewManager(m.rig, git.NewGit(m.rig.Path))
 	polecats, err := polecatMgr.List()
 	if err != nil {
 		return fmt.Errorf("listing polecats: %w", err)
 	}
 	t := tmux.NewTmux()
 	sessMgr := session.NewManager(t, m.rig)
 	for _, p := range polecats {
 		// Skip if not running
 		running, _ := sessMgr.IsRunning(p.Name)
 		if !running {
 			continue
 		}
 		// Skip if already signaled done
 		if m.hasSentDone(w, p.Name) {
 			continue
 		}
 		// Check if the polecat's issue is closed
 		issueID := m.getPolecatIssue(p.Name, p.ClonePath)
 		if issueID == "" {
 			continue
 		}
 		closed, err := m.isIssueClosed(issueID)
 		if err != nil || !closed {
 			continue
 		}
 		// Issue is closed but polecat hasn't sent POLECAT_DONE
 		waitKey := "waiting:" + p.Name
 		waitingSince := m.getWaitingTimestamp(w, waitKey)
 		if waitingSince.IsZero() {
 			// First detection - record timestamp and nudge
 			fmt.Printf("Issue %s is closed but polecat %s hasn't signaled done\n", issueID, p.Name)
 			m.recordWaiting(w, waitKey)
 			if err := m.sendNudge(p.Name, "Your issue is closed. Please run 'gt done' to complete shutdown."); err != nil {
 				fmt.Printf("  Warning: failed to send nudge: %v\n", err)
 			}
 		} else if time.Since(waitingSince) > PendingCompletionTimeout {
 			// Timeout reached - force cleanup
 			fmt.Printf("Timeout waiting for POLECAT_DONE from %s, force cleaning up\n", p.Name)
 			// Verify state first (this still protects uncommitted work)
 			if err := m.verifyPolecatState(p.Name); err != nil {
 				fmt.Printf("  Cannot force cleanup - %v\n", err)
 				// Escalate to Mayor
 				m.escalateToMayor(p.Name)
 				continue
 			}
 			if err := m.cleanupPolecat(p.Name); err != nil {
 				fmt.Printf("  Force cleanup failed: %v\n", err)
 				continue
 			}
 			// Clean up tracking
 			m.clearWaiting(w, waitKey)
 		} else {
 			// Still waiting
 			elapsed := time.Since(waitingSince).Round(time.Minute)
 			remaining := (PendingCompletionTimeout - time.Since(waitingSince)).Round(time.Minute)
 			fmt.Printf("Waiting for POLECAT_DONE from %s (elapsed: %v, timeout in: %v)\n",
 				p.Name, elapsed, remaining)
 		}
 	}
 	return nil
 }
 // getPolecatIssue tries to determine which issue a polecat is working on.
 func (m *Manager) getPolecatIssue(polecatName, polecatPath string) string {
 	// Try to read from state file
 	stateFile := filepath.Join(polecatPath, ".runtime", "state.json")
 	data, err := os.ReadFile(stateFile)
 	if err != nil {
 		return ""
 	}
 	var state struct {
 		IssueID string `json:"issue_id"`
 	}
 	if err := json.Unmarshal(data, &state); err != nil {
 		return ""
 	}
 	return state.IssueID
 }
 // isIssueClosed checks if an issue is closed.
 func (m *Manager) isIssueClosed(issueID string) (bool, error) {
 	cmd := exec.Command("bd", "show", issueID, "--json")
 	cmd.Dir = m.workDir
 	var stdout, stderr bytes.Buffer
 	cmd.Stdout = &stdout
 	cmd.Stderr = &stderr
 	if err := cmd.Run(); err != nil {
 		return false, fmt.Errorf("%s", stderr.String())
 	}
 	// Parse to check status
 	var issues []struct {
 		Status string `json:"status"`
 	}
 	if err := json.Unmarshal(stdout.Bytes(), &issues); err != nil {
 		return false, err
 	}
 	if len(issues) == 0 {
 		return false, nil
 	}
 	return issues[0].Status == "closed", nil
 }
 // getWaitingTimestamp retrieves when we started waiting for a polecat.
 func (m *Manager) getWaitingTimestamp(w *Witness, key string) time.Time {
 	// Parse timestamps from SpawnedIssues with "waiting:{name}:{timestamp}" format
 	for _, entry := range w.SpawnedIssues {
 		if strings.HasPrefix(entry, key+":") {
 			tsStr := entry[len(key)+1:]
 			if ts, err := time.Parse(time.RFC3339, tsStr); err == nil {
 				return ts
 			}
 		}
 	}
 	return time.Time{}
 }
 // recordWaiting records when we started waiting for a polecat to complete.
 func (m *Manager) recordWaiting(w *Witness, key string) {
 	entry := fmt.Sprintf("%s:%s", key, time.Now().Format(time.RFC3339))
 	w.SpawnedIssues = append(w.SpawnedIssues, entry)
 	_ = m.saveState(w)
 }
 // clearWaiting removes the waiting timestamp for a polecat.
 func (m *Manager) clearWaiting(w *Witness, key string) {
 	var filtered []string
 	for _, entry := range w.SpawnedIssues {
 		if !strings.HasPrefix(entry, key) {
 			filtered = append(filtered, entry)
 		}
 	}
 	w.SpawnedIssues = filtered
 	_ = m.saveState(w)
 }
 // cleanupPolecat performs the full cleanup sequence for a transient polecat.
 // 1. Check for uncommitted work (stubbornly refuses to lose work)
 // 2. Kill session