Witness: Verify POLECAT_DONE before stopping sessions (gt-ldk8)

- Add handling for POLECAT_DONE messages in processShutdownRequests()
- Track which polecats have signaled done (using SpawnedIssues with "done:" prefix)
- For LIFECYCLE:shutdown requests, wait for POLECAT_DONE before cleanup
- Add checkPendingCompletions() to nudge polecats with closed issues
- Add 10-minute timeout with force-kill after waiting for POLECAT_DONE
- Protects against losing MR submissions when Witness cleans up too early

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Steve Yegge
2025-12-23 00:17:35 -08:00
parent a3c671188f
commit 33cdd623bc

View File

@@ -361,6 +361,11 @@ func (m *Manager) checkAndProcess(w *Witness) {
fmt.Printf("Shutdown request error: %v\n", err)
}
// Check for polecats with closed issues that haven't signaled done
if err := m.checkPendingCompletions(w); err != nil {
fmt.Printf("Pending completions check error: %v\n", err)
}
// Auto-spawn for ready work (if enabled)
if w.Config.AutoSpawn {
if err := m.autoSpawnForReadyWork(w); err != nil {
@@ -639,7 +644,48 @@ func (m *Manager) processShutdownRequests(w *Witness) error {
}
for _, msg := range messages {
// Look for LIFECYCLE requests
// Handle POLECAT_DONE messages (polecat has completed work and is ready for cleanup)
if strings.HasPrefix(msg.Subject, "POLECAT_DONE ") {
polecatName := extractPolecatNameFromDone(msg.Subject)
if polecatName == "" {
fmt.Printf("Warning: could not extract polecat name from POLECAT_DONE message\n")
m.ackMessage(msg.ID)
continue
}
fmt.Printf("Processing POLECAT_DONE from %s\n", polecatName)
// Record that this polecat has signaled done
m.recordDone(w, polecatName)
// Verify polecat state before cleanup
if err := m.verifyPolecatState(polecatName); err != nil {
fmt.Printf(" Verification failed: %v\n", err)
// Send nudge to polecat to fix state
if err := m.sendNudge(polecatName, err.Error()); err != nil {
fmt.Printf(" Warning: failed to send nudge: %v\n", err)
}
// Don't ack message - will retry on next check
continue
}
// Perform cleanup
if err := m.cleanupPolecat(polecatName); err != nil {
fmt.Printf(" Cleanup error: %v\n", err)
// Don't ack message on error - will retry
continue
}
fmt.Printf(" Cleanup complete\n")
// Acknowledge the message
m.ackMessage(msg.ID)
continue
}
// Handle LIFECYCLE shutdown requests (legacy/Deacon-managed)
if strings.Contains(msg.Subject, "LIFECYCLE:") && strings.Contains(msg.Subject, "shutdown") {
fmt.Printf("Processing shutdown request: %s\n", msg.Subject)
@@ -653,6 +699,19 @@ func (m *Manager) processShutdownRequests(w *Witness) error {
fmt.Printf(" Polecat: %s\n", polecatName)
// SAFETY: Only cleanup if polecat has sent POLECAT_DONE
if !m.hasSentDone(w, polecatName) {
fmt.Printf(" Waiting for POLECAT_DONE from %s before cleanup\n", polecatName)
// Send reminder to polecat to complete shutdown sequence
if err := m.sendNudge(polecatName, "Please run 'gt done' to signal completion"); err != nil {
fmt.Printf(" Warning: failed to send nudge: %v\n", err)
}
// Don't ack message - will retry on next check
continue
}
// Verify polecat state before cleanup
if err := m.verifyPolecatState(polecatName); err != nil {
fmt.Printf(" Verification failed: %v\n", err)
@@ -801,6 +860,202 @@ func extractPolecatName(body string) string {
return ""
}
// extractPolecatNameFromDone returns the polecat name carried by a
// POLECAT_DONE subject of the form "POLECAT_DONE {name}", with surrounding
// whitespace removed. Subjects that do not start with that marker yield "".
func extractPolecatNameFromDone(subject string) string {
	const marker = "POLECAT_DONE "
	if !strings.HasPrefix(subject, marker) {
		return ""
	}
	name := strings.TrimPrefix(subject, marker)
	return strings.TrimSpace(name)
}
// recordDone records that a polecat has sent POLECAT_DONE.
//
// The fact is stored as a "done:{name}" entry in w.SpawnedIssues; an existing
// entry is not duplicated. Any "waiting:{name}:{timestamp}" entry for the same
// polecat is removed at the same time: once the polecat has signaled done the
// pending-completion timer is meaningless, and leaving it behind would let a
// later polecat with the same name inherit an already-expired timer.
//
// State is saved only when something actually changed.
func (m *Manager) recordDone(w *Witness, polecatName string) {
	doneKey := "done:" + polecatName
	waitPrefix := "waiting:" + polecatName + ":"

	kept := make([]string, 0, len(w.SpawnedIssues)+1)
	alreadyDone := false
	for _, entry := range w.SpawnedIssues {
		if strings.HasPrefix(entry, waitPrefix) {
			// Stale pending-completion marker for this polecat: drop it.
			continue
		}
		if entry == doneKey {
			alreadyDone = true
		}
		kept = append(kept, entry)
	}

	if alreadyDone && len(kept) == len(w.SpawnedIssues) {
		// Already recorded and nothing to clean up.
		return
	}
	if !alreadyDone {
		kept = append(kept, doneKey)
	}
	w.SpawnedIssues = kept

	// Persisting is best-effort, as elsewhere in this file: the in-memory
	// state is still updated for the current run if the save fails.
	_ = m.saveState(w)
}
// hasSentDone reports whether w.SpawnedIssues contains the "done:{name}"
// marker for the given polecat.
func (m *Manager) hasSentDone(w *Witness, polecatName string) bool {
	marker := "done:" + polecatName
	found := false
	for i := 0; i < len(w.SpawnedIssues) && !found; i++ {
		found = w.SpawnedIssues[i] == marker
	}
	return found
}
// PendingCompletionTimeout is how long checkPendingCompletions waits for
// POLECAT_DONE, measured from the moment it first observes that a running
// polecat's issue is closed, before it attempts a forced cleanup of that
// polecat.
const PendingCompletionTimeout = 10 * time.Minute
// checkPendingCompletions looks for running polecats whose issue is closed but
// that have not yet sent POLECAT_DONE.
//
// For each such polecat:
//   - on first detection a "waiting:{name}:{timestamp}" marker is recorded and
//     the polecat is nudged to run 'gt done';
//   - while the marker is no older than PendingCompletionTimeout, progress is
//     only logged;
//   - once the marker is older than PendingCompletionTimeout, the polecat is
//     cleaned up if verifyPolecatState passes; otherwise the polecat is
//     escalated to the Mayor and re-examined on the next check.
//
// It returns an error only when the polecat list cannot be obtained.
// Per-polecat problems are logged and do not abort the scan.
func (m *Manager) checkPendingCompletions(w *Witness) error {
	polecatMgr := polecat.NewManager(m.rig, git.NewGit(m.rig.Path))
	polecats, err := polecatMgr.List()
	if err != nil {
		return fmt.Errorf("listing polecats: %w", err)
	}

	sessMgr := session.NewManager(tmux.NewTmux(), m.rig)

	for _, p := range polecats {
		// A failed liveness probe is deliberately treated like "not running":
		// without a confirmed session there is nothing to nudge or kill.
		running, _ := sessMgr.IsRunning(p.Name)
		if !running {
			continue
		}

		// Skip if already signaled done
		if m.hasSentDone(w, p.Name) {
			continue
		}

		// Check if the polecat's issue is closed
		issueID := m.getPolecatIssue(p.Name, p.ClonePath)
		if issueID == "" {
			continue
		}
		closed, err := m.isIssueClosed(issueID)
		if err != nil {
			// Surface the failure instead of silently treating it as "open";
			// otherwise a broken bd invocation disables this check invisibly.
			fmt.Printf("Warning: could not check status of issue %s for polecat %s: %v\n", issueID, p.Name, err)
			continue
		}
		if !closed {
			continue
		}

		// Issue is closed but polecat hasn't sent POLECAT_DONE
		waitKey := "waiting:" + p.Name
		waitingSince := m.getWaitingTimestamp(w, waitKey)

		if waitingSince.IsZero() {
			// First detection - record timestamp and nudge
			fmt.Printf("Issue %s is closed but polecat %s hasn't signaled done\n", issueID, p.Name)
			m.recordWaiting(w, waitKey)
			if err := m.sendNudge(p.Name, "Your issue is closed. Please run 'gt done' to complete shutdown."); err != nil {
				fmt.Printf("  Warning: failed to send nudge: %v\n", err)
			}
			continue
		}

		// Read the clock once so the elapsed and remaining figures agree.
		elapsed := time.Since(waitingSince)
		if elapsed <= PendingCompletionTimeout {
			// Still waiting
			fmt.Printf("Waiting for POLECAT_DONE from %s (elapsed: %v, timeout in: %v)\n",
				p.Name, elapsed.Round(time.Minute), (PendingCompletionTimeout - elapsed).Round(time.Minute))
			continue
		}

		// Timeout reached - force cleanup
		fmt.Printf("Timeout waiting for POLECAT_DONE from %s, force cleaning up\n", p.Name)

		// Verify state first (this still protects uncommitted work)
		if err := m.verifyPolecatState(p.Name); err != nil {
			fmt.Printf("  Cannot force cleanup - %v\n", err)
			// Escalate to Mayor.
			// NOTE(review): the waiting marker is kept, so this escalation
			// repeats on every check until the state is fixed - confirm that
			// is intended.
			m.escalateToMayor(p.Name)
			continue
		}

		if err := m.cleanupPolecat(p.Name); err != nil {
			// Marker is kept so the cleanup is retried on the next check.
			fmt.Printf("  Force cleanup failed: %v\n", err)
			continue
		}

		// Clean up tracking
		m.clearWaiting(w, waitKey)
	}

	return nil
}
// getPolecatIssue returns the "issue_id" value stored in the polecat's
// .runtime/state.json file under polecatPath. It returns "" when the file
// cannot be read or is not valid JSON. polecatName is currently unused.
func (m *Manager) getPolecatIssue(polecatName, polecatPath string) string {
	raw, readErr := os.ReadFile(filepath.Join(polecatPath, ".runtime", "state.json"))
	if readErr != nil {
		return ""
	}

	var runtimeState struct {
		IssueID string `json:"issue_id"`
	}
	if json.Unmarshal(raw, &runtimeState) != nil {
		return ""
	}
	return runtimeState.IssueID
}
// isIssueClosed reports whether the given issue has status "closed" according
// to `bd show <issueID> --json`, run with m.workDir as working directory.
//
// The command output is decoded as a JSON array of issues and only the first
// element is consulted; an empty array is reported as not closed. An error is
// returned when the command fails or its output cannot be decoded.
func (m *Manager) isIssueClosed(issueID string) (bool, error) {
	cmd := exec.Command("bd", "show", issueID, "--json")
	cmd.Dir = m.workDir

	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		// Keep the underlying error: when bd cannot be started at all (for
		// example it is not on PATH) stderr is empty, and reporting stderr
		// alone would produce a blank error message.
		if detail := strings.TrimSpace(stderr.String()); detail != "" {
			return false, fmt.Errorf("bd show %s: %w: %s", issueID, err, detail)
		}
		return false, fmt.Errorf("bd show %s: %w", issueID, err)
	}

	// Parse to check status
	var issues []struct {
		Status string `json:"status"`
	}
	if err := json.Unmarshal(stdout.Bytes(), &issues); err != nil {
		return false, fmt.Errorf("parsing bd show output for %s: %w", issueID, err)
	}

	if len(issues) == 0 {
		return false, nil
	}
	return issues[0].Status == "closed", nil
}
// getWaitingTimestamp returns the time stored in the first
// "{key}:{RFC3339 timestamp}" entry of w.SpawnedIssues whose timestamp parses.
// Entries with an unparsable timestamp are skipped. The zero time is returned
// when no usable entry exists.
func (m *Manager) getWaitingTimestamp(w *Witness, key string) time.Time {
	marker := key + ":"
	for _, entry := range w.SpawnedIssues {
		if !strings.HasPrefix(entry, marker) {
			continue
		}
		started, parseErr := time.Parse(time.RFC3339, strings.TrimPrefix(entry, marker))
		if parseErr != nil {
			continue
		}
		return started
	}
	return time.Time{}
}
// recordWaiting appends a "{key}:{RFC3339 timestamp}" entry stamped with the
// current time to w.SpawnedIssues, marking when the wait for a polecat began,
// and then saves the witness state.
func (m *Manager) recordWaiting(w *Witness, key string) {
	stamp := time.Now().Format(time.RFC3339)
	w.SpawnedIssues = append(w.SpawnedIssues, key+":"+stamp)
	// The save result is intentionally discarded (best-effort persistence).
	_ = m.saveState(w)
}
// clearWaiting removes the waiting marker(s) stored under key from
// w.SpawnedIssues and then saves the witness state.
//
// Only entries of the form "{key}:{timestamp}" (or exactly key) are removed.
// Matching on the ":" delimiter matters: a bare prefix match on
// "waiting:foo" would also delete the marker of a different polecat named
// "foobar". getWaitingTimestamp matches on the same "{key}:" prefix.
func (m *Manager) clearWaiting(w *Witness, key string) {
	marker := key + ":"

	var kept []string
	for _, entry := range w.SpawnedIssues {
		if entry == key || strings.HasPrefix(entry, marker) {
			continue
		}
		kept = append(kept, entry)
	}
	w.SpawnedIssues = kept

	// The save result is intentionally discarded (best-effort persistence).
	_ = m.saveState(w)
}
// cleanupPolecat performs the full cleanup sequence for a transient polecat.
// 1. Check for uncommitted work (stubbornly refuses to lose work)
// 2. Kill session