Files
gastown/internal/witness/manager.go
Steve Yegge 2afc72dd86 refactor: remove molecule logic from witness Go code
Remove embedded molecule management from witness - this logic belongs
in molecule definitions (YAML + hooks), not Go code.

Removed:
- WitnessHandoffState, WorkerState types
- Handoff bead management (load/save/ensure)
- Patrol instance creation (ensurePatrolInstance)
- Polecat arm tracking (ensurePolecatArm, closePolecatArm)
- updateWorkerActivity function

Simplified:
- Nudge tracking now uses only SpawnedIssues (in-memory)
- run() no longer loads handoff state or creates patrol instances
- healthCheck() no longer manages tracking arms

Fixed:
- escalateToMayor: bd mail send → gt mail send
- ackMessage: bd mail ack → gt mail archive

The witness now does its core job (health checks, nudges, escalation,
cleanup) without trying to manage molecule state. Molecule tracking
should be handled by the molecule system itself via bd mol commands.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-23 23:02:20 -08:00

1137 lines
31 KiB
Go

package witness
import (
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"strings"
	"syscall"
	"time"

	"github.com/steveyegge/gastown/internal/git"
	"github.com/steveyegge/gastown/internal/polecat"
	"github.com/steveyegge/gastown/internal/rig"
	"github.com/steveyegge/gastown/internal/session"
	"github.com/steveyegge/gastown/internal/tmux"
)
// Common errors returned by the Manager lifecycle operations.
var (
// ErrNotRunning is returned by Stop when the saved witness state is not StateRunning.
ErrNotRunning = errors.New("witness not running")
// ErrAlreadyRunning is returned by Start when the saved state is running and its PID is still alive.
ErrAlreadyRunning = errors.New("witness already running")
)
// Manager handles witness lifecycle and monitoring operations.
type Manager struct {
// rig is the rig whose polecats this witness monitors; its Path is the root
// for the state file and polecat worktrees.
rig *rig.Rig
// workDir is the directory external gt/bd commands are run from.
// NewManager sets it to the rig path.
workDir string
}
// NewManager builds a witness Manager bound to the given rig. External
// commands issued by the manager are run from the rig's root path.
func NewManager(r *rig.Rig) *Manager {
	mgr := new(Manager)
	mgr.rig = r
	mgr.workDir = r.Path
	return mgr
}
// stateFile reports where the witness state is persisted:
// <rig path>/.runtime/witness.json.
func (m *Manager) stateFile() string {
	runtimeDir := filepath.Join(m.rig.Path, ".runtime")
	return filepath.Join(runtimeDir, "witness.json")
}
// loadState reads the persisted witness state from disk.
//
// A missing state file is not an error: it yields a fresh, stopped Witness
// for this rig. Any other read failure, or malformed JSON, is returned to
// the caller.
func (m *Manager) loadState() (*Witness, error) {
	raw, readErr := os.ReadFile(m.stateFile())
	switch {
	case readErr == nil:
		// File was read; decode it below.
	case os.IsNotExist(readErr):
		fresh := &Witness{RigName: m.rig.Name, State: StateStopped}
		return fresh, nil
	default:
		return nil, readErr
	}

	state := new(Witness)
	if decodeErr := json.Unmarshal(raw, state); decodeErr != nil {
		return nil, decodeErr
	}
	return state, nil
}
// saveState writes the witness state to the state file as indented JSON,
// creating the containing directory first if it does not exist yet.
func (m *Manager) saveState(w *Witness) error {
	target := m.stateFile()
	if err := os.MkdirAll(filepath.Dir(target), 0755); err != nil {
		return err
	}

	encoded, err := json.MarshalIndent(w, "", " ")
	if err != nil {
		return err
	}
	return os.WriteFile(target, encoded, 0644)
}
// Status loads the saved witness state and returns it with the monitored
// polecat list refreshed from the rig.
//
// If the saved state claims the witness is running but the recorded PID no
// longer exists, the state is downgraded to stopped and written back.
func (m *Manager) Status() (*Witness, error) {
	state, err := m.loadState()
	if err != nil {
		return nil, err
	}

	// A recorded "running" state is only trusted while its PID is alive.
	stale := state.State == StateRunning && state.PID > 0 && !processExists(state.PID)
	if stale {
		state.State = StateStopped
		state.PID = 0
		_ = m.saveState(state) // best effort: the corrected state is still returned
	}

	state.MonitoredPolecats = m.rig.Polecats
	return state, nil
}
// Start marks the witness as running and persists that state.
//
// When foreground is true the monitoring loop runs in the current process
// and this call blocks. When it is false nothing is spawned yet: the state
// is only recorded as running (with the calling process's PID) and Start
// returns immediately.
//
// Returns ErrAlreadyRunning if the saved state is running and its PID is
// still alive.
func (m *Manager) Start(foreground bool) error {
	state, err := m.loadState()
	if err != nil {
		return err
	}

	alreadyLive := state.State == StateRunning && state.PID > 0 && processExists(state.PID)
	if alreadyLive {
		return ErrAlreadyRunning
	}

	startedAt := time.Now()
	state.State = StateRunning
	state.StartedAt = &startedAt
	// The current PID is correct for foreground mode; a real background
	// daemon would have to record the spawned process's PID instead.
	state.PID = os.Getpid()
	state.MonitoredPolecats = m.rig.Polecats
	if err := m.saveState(state); err != nil {
		return err
	}

	if !foreground {
		// Background mode is an MVP stub: only the state is recorded.
		return nil
	}
	return m.run(state)
}
// Stop marks the witness as stopped and persists that state.
//
// If the saved state records a PID other than the calling process, that
// process is sent an interrupt signal (os.Interrupt) first; failures to find
// or signal it are ignored. Returns ErrNotRunning if the saved state is not
// running.
func (m *Manager) Stop() error {
	state, err := m.loadState()
	if err != nil {
		return err
	}
	if state.State != StateRunning {
		return ErrNotRunning
	}

	if pid := state.PID; pid > 0 && pid != os.Getpid() {
		proc, findErr := os.FindProcess(pid)
		if findErr == nil {
			// Best effort: the process may already be gone.
			_ = proc.Signal(os.Interrupt)
		}
	}

	state.PID = 0
	state.State = StateStopped
	return m.saveState(state)
}
// run is the foreground monitoring loop: it performs one check pass
// immediately and then another every 30 seconds. It does not return on its
// own; the process is expected to be interrupted.
func (m *Manager) run(w *Witness) error {
	fmt.Println("Witness running...")
	fmt.Println("Press Ctrl+C to stop")

	m.checkAndProcess(w)

	tick := time.NewTicker(30 * time.Second)
	defer tick.Stop()
	for {
		<-tick.C
		m.checkAndProcess(w)
	}
}
// checkAndProcess runs one monitoring pass: health check, shutdown-request
// processing, pending-completion check and, when enabled in the witness
// config, auto-spawn. A failing step is reported on stdout and does not
// prevent the following steps from running.
func (m *Manager) checkAndProcess(w *Witness) {
	steps := []struct {
		failFormat string
		run        func(*Witness) error
	}{
		{"Health check error: %v\n", m.healthCheck},
		{"Shutdown request error: %v\n", m.processShutdownRequests},
		{"Pending completions check error: %v\n", m.checkPendingCompletions},
	}
	for _, step := range steps {
		if err := step.run(w); err != nil {
			fmt.Printf(step.failFormat, err)
		}
	}

	if !w.Config.AutoSpawn {
		return
	}
	if err := m.autoSpawnForReadyWork(w); err != nil {
		fmt.Printf("Auto-spawn error: %v\n", err)
	}
}
// healthCheck inspects every polecat that has a running session, nudging or
// escalating the stuck ones and resetting the nudge count of the healthy
// ones. It also updates the check statistics and the monitored-polecat list,
// then persists the witness state.
func (m *Manager) healthCheck(w *Witness) error {
	checkedAt := time.Now()
	w.LastCheckAt = &checkedAt
	w.Stats.TotalChecks++
	w.Stats.TodayChecks++

	polecatMgr := polecat.NewManager(m.rig, git.NewGit(m.rig.Path))
	polecats, err := polecatMgr.List()
	if err != nil {
		return fmt.Errorf("listing polecats: %w", err)
	}

	sessions := session.NewManager(tmux.NewTmux(), m.rig)

	var live []string
	for _, pc := range polecats {
		// Errors from IsRunning are treated the same as "not running".
		if up, _ := sessions.IsRunning(pc.Name); !up {
			continue
		}
		live = append(live, pc.Name)

		switch m.checkPolecatHealth(pc.Name, pc.ClonePath) {
		case PolecatStuck:
			m.handleStuckPolecat(w, pc.Name)
		case PolecatHealthy:
			// Activity was seen, so earlier nudges no longer count.
			m.clearNudgeCount(w, pc.Name)
		}
	}

	w.MonitoredPolecats = live
	return m.saveState(w)
}
// PolecatHealthStatus represents the health status of a polecat.
type PolecatHealthStatus int
// Possible health states. checkPolecatHealth, as written in this file, only
// ever returns PolecatHealthy or PolecatStuck.
const (
// PolecatHealthy means the polecat is working normally.
PolecatHealthy PolecatHealthStatus = iota
// PolecatStuck means the polecat has no recent activity.
PolecatStuck
// PolecatDead means the polecat session is not responding.
PolecatDead
)
// StuckThresholdMinutes is the default time without activity before a polecat is considered stuck.
// checkPolecatHealth uses it as a number of minutes, and escalateToMayor
// reports twice this value in its escalation message.
const StuckThresholdMinutes = 30
// checkPolecatHealth classifies a polecat by how recently its working
// directory was touched. The polecat is healthy if any of the following was
// modified within the stuck threshold, checked in this order:
//
//  1. the .git entry under path,
//  2. .runtime/state.json under path,
//  3. the newest of the files probed by getLatestModTime.
//
// Otherwise it is reported as stuck. The name argument is not used.
func (m *Manager) checkPolecatHealth(name, path string) PolecatHealthStatus {
	window := time.Duration(StuckThresholdMinutes) * time.Minute

	// touchedRecently reports whether p exists and was modified inside the window.
	touchedRecently := func(p string) bool {
		info, err := os.Stat(p)
		return err == nil && time.Since(info.ModTime()) < window
	}

	if touchedRecently(filepath.Join(path, ".git")) {
		return PolecatHealthy
	}
	if touchedRecently(filepath.Join(path, ".runtime", "state.json")) {
		return PolecatHealthy
	}
	if newest := m.getLatestModTime(path); !newest.IsZero() && time.Since(newest) < window {
		return PolecatHealthy
	}
	return PolecatStuck
}
// getLatestModTime returns the newest modification time among a small set of
// key files under dir (.git/logs/HEAD, .git/index, .beads/issues.jsonl).
// It does not walk the directory. The zero time is returned when none of the
// files can be stat'ed.
func (m *Manager) getLatestModTime(dir string) time.Time {
	candidates := [...]string{
		filepath.Join(dir, ".git", "logs", "HEAD"),
		filepath.Join(dir, ".git", "index"),
		filepath.Join(dir, ".beads", "issues.jsonl"),
	}

	var newest time.Time
	for i := range candidates {
		info, err := os.Stat(candidates[i])
		if err != nil {
			continue
		}
		if modified := info.ModTime(); modified.After(newest) {
			newest = modified
		}
	}
	return newest
}
// handleStuckPolecat reacts to a polecat that was classified as stuck,
// escalating step by step based on how many nudges are already recorded:
//
//   - none: mail the polecat a nudge and record it;
//   - one: escalate to the Mayor and record a second nudge;
//   - more: only log, leaving resolution to a human.
//
// Failures to send mail are logged as warnings; the nudge is recorded
// regardless.
func (m *Manager) handleStuckPolecat(w *Witness, polecatName string) {
	fmt.Printf("Polecat %s appears stuck (no activity for %d minutes)\n",
		polecatName, StuckThresholdMinutes)

	switch m.getNudgeCount(w, polecatName) {
	case 0:
		fmt.Printf(" Sending nudge to %s...\n", polecatName)
		if err := m.sendNudge(polecatName, "No activity detected. Are you still working?"); err != nil {
			fmt.Printf(" Warning: failed to send nudge: %v\n", err)
		}
		m.recordNudge(w, polecatName)
		w.Stats.TotalNudges++
		w.Stats.TodayNudges++

	case 1:
		fmt.Printf(" Escalating %s to Mayor (no response to nudge)...\n", polecatName)
		if err := m.escalateToMayor(polecatName); err != nil {
			fmt.Printf(" Warning: failed to escalate: %v\n", err)
		}
		w.Stats.TotalEscalations++
		// Recording a second nudge moves the polecat into the "wait for a
		// human" state on the next detection.
		m.recordNudge(w, polecatName)

	default:
		fmt.Printf(" %s still stuck (waiting for human intervention)\n", polecatName)
	}
}
// getNudgeCount counts the nudges recorded for a polecat. Nudges are stored
// as "nudge:<name>" entries in the witness's SpawnedIssues list, one entry
// per nudge.
func (m *Manager) getNudgeCount(w *Witness, polecatName string) int {
	marker := "nudge:" + polecatName
	total := 0
	for i := range w.SpawnedIssues {
		if w.SpawnedIssues[i] == marker {
			total++
		}
	}
	return total
}
// recordNudge notes one more nudge for a polecat by appending a
// "nudge:<name>" entry to SpawnedIssues. It does not persist the state
// itself.
func (m *Manager) recordNudge(w *Witness, polecatName string) {
	w.SpawnedIssues = append(w.SpawnedIssues, "nudge:"+polecatName)
}
// clearNudgeCount forgets every nudge recorded for a polecat (for example
// once it shows activity again) by dropping all "nudge:<name>" entries from
// SpawnedIssues. Other entries keep their relative order.
func (m *Manager) clearNudgeCount(w *Witness, polecatName string) {
	marker := "nudge:" + polecatName

	var kept []string
	for i := range w.SpawnedIssues {
		entry := w.SpawnedIssues[i]
		if entry == marker {
			continue
		}
		kept = append(kept, entry)
	}
	w.SpawnedIssues = kept
}
// escalateToMayor mails the Mayor ("mayor/") an escalation about a stuck
// polecat via `gt mail send`, run from the manager's working directory.
// On failure the returned error wraps the command error and carries the
// command's combined output.
func (m *Manager) escalateToMayor(polecatName string) error {
	rigName := m.rig.Name

	subject := fmt.Sprintf("ESCALATION: Polecat %s stuck", polecatName)
	body := fmt.Sprintf(`Polecat %s in rig %s appears stuck.
This polecat has been unresponsive for over %d minutes despite nudging.
Recommended actions:
1. Check 'gt session attach %s/%s' to see current state
2. If truly stuck, run 'gt session stop %s/%s' to kill the session
3. Investigate root cause
Rig: %s
Time: %s
`, polecatName, rigName, StuckThresholdMinutes*2,
		rigName, polecatName,
		rigName, polecatName,
		rigName, time.Now().Format(time.RFC3339))

	mail := exec.Command("gt", "mail", "send", "mayor/", "-s", subject, "-m", body)
	mail.Dir = m.workDir

	out, err := mail.CombinedOutput()
	if err != nil {
		return fmt.Errorf("%w: %s", err, string(out))
	}
	return nil
}
// processShutdownRequests reads the witness mailbox and handles the two kinds
// of lifecycle message it understands:
//
//   - "POLECAT_DONE <name>": the polecat reports it has finished. This is
//     recorded, then the polecat is verified and cleaned up.
//   - subjects containing both "LIFECYCLE:" and "shutdown" (legacy /
//     Deacon-managed): the polecat name is taken from the message body, and
//     cleanup only proceeds once that polecat has sent POLECAT_DONE;
//     otherwise it is reminded to run 'gt done'.
//
// A message is acknowledged when it was handled successfully or when no
// polecat name could be extracted from it. In every other case it is left
// unacknowledged so it is retried on the next pass. Messages of any other
// kind are ignored.
//
// An error is returned only when the mailbox cannot be read.
func (m *Manager) processShutdownRequests(w *Witness) error {
	messages, err := m.getWitnessMessages()
	if err != nil {
		return fmt.Errorf("getting messages: %w", err)
	}

	for _, msg := range messages {
		switch {
		case strings.HasPrefix(msg.Subject, "POLECAT_DONE "):
			polecatName := extractPolecatNameFromDone(msg.Subject)
			if polecatName == "" {
				fmt.Printf("Warning: could not extract polecat name from POLECAT_DONE message\n")
				m.ackMessage(msg.ID)
				continue
			}
			fmt.Printf("Processing POLECAT_DONE from %s\n", polecatName)

			// Remember the signal so a later LIFECYCLE shutdown request
			// for the same polecat is allowed to proceed.
			m.recordDone(w, polecatName)

			if m.verifyAndCleanupPolecat(polecatName) {
				m.ackMessage(msg.ID)
			}

		case strings.Contains(msg.Subject, "LIFECYCLE:") && strings.Contains(msg.Subject, "shutdown"):
			fmt.Printf("Processing shutdown request: %s\n", msg.Subject)

			polecatName := extractPolecatName(msg.Body)
			if polecatName == "" {
				fmt.Printf(" Warning: could not extract polecat name from message\n")
				m.ackMessage(msg.ID)
				continue
			}
			fmt.Printf(" Polecat: %s\n", polecatName)

			// SAFETY: only clean up a polecat that has sent POLECAT_DONE.
			if !m.hasSentDone(w, polecatName) {
				fmt.Printf(" Waiting for POLECAT_DONE from %s before cleanup\n", polecatName)
				if err := m.sendNudge(polecatName, "Please run 'gt done' to signal completion"); err != nil {
					fmt.Printf(" Warning: failed to send nudge: %v\n", err)
				}
				// Not acknowledged: retried on the next check.
				continue
			}

			if m.verifyAndCleanupPolecat(polecatName) {
				m.ackMessage(msg.ID)
			}
		}
	}
	return nil
}

// verifyAndCleanupPolecat verifies that a polecat is safe to remove and then
// cleans it up. It reports whether cleanup completed, i.e. whether the
// triggering message may be acknowledged. When verification fails the
// polecat is nudged with the verification error; all failures are logged.
func (m *Manager) verifyAndCleanupPolecat(polecatName string) bool {
	if err := m.verifyPolecatState(polecatName); err != nil {
		fmt.Printf(" Verification failed: %v\n", err)
		if nudgeErr := m.sendNudge(polecatName, err.Error()); nudgeErr != nil {
			fmt.Printf(" Warning: failed to send nudge: %v\n", nudgeErr)
		}
		return false
	}

	if err := m.cleanupPolecat(polecatName); err != nil {
		fmt.Printf(" Cleanup error: %v\n", err)
		return false
	}

	fmt.Printf(" Cleanup complete\n")
	return true
}
// verifyPolecatState decides whether a polecat's worktree can be removed
// without losing work. It returns nil when:
//
//   - the polecat directory no longer exists (already cleaned up), or
//   - the git working tree is clean and the polecat/<name> branch has been
//     pushed to origin.
//
// If the push status itself cannot be determined (for example because of a
// network problem) a warning is logged and the check passes. Otherwise the
// returned error describes what the polecat still has to fix.
func (m *Manager) verifyPolecatState(polecatName string) error {
	worktree := filepath.Join(m.rig.Path, "polecats", polecatName)
	if _, statErr := os.Stat(worktree); os.IsNotExist(statErr) {
		return nil
	}

	repo := git.NewGit(worktree)

	// Beads files are tracked in git, so a clean tree also covers them.
	st, err := repo.Status()
	if err != nil {
		return fmt.Errorf("checking git status: %w", err)
	}
	if !st.Clean {
		return fmt.Errorf("git working tree is not clean")
	}

	// A polecat may close its issue without pushing; removing the worktree
	// at that point would lose the commits.
	branch := "polecat/" + polecatName
	pushed, ahead, err := repo.BranchPushedToRemote(branch, "origin")
	switch {
	case err != nil:
		fmt.Printf(" Warning: could not verify branch push status: %v\n", err)
	case !pushed:
		return fmt.Errorf("branch %s has %d unpushed commit(s) - run 'git push origin %s' before closing",
			branch, ahead, branch)
	}
	return nil
}
// sendNudge mails a polecat ("<rig>/<polecat>") via `gt mail send`, run from
// the manager's working directory. The given reason is embedded in both the
// subject and the body. On failure the returned error wraps the command
// error and carries the command's combined output.
//
// The message text is worded as a denied shutdown request, although callers
// in this file also use it for inactivity and completion reminders.
func (m *Manager) sendNudge(polecatName, reason string) error {
	rigName := m.rig.Name

	subject := "NUDGE: Cannot shutdown - " + reason
	body := fmt.Sprintf(`Your shutdown request was denied because: %s
Please fix the issue and run 'gt handoff' again.
Polecat: %s
Rig: %s
Time: %s
`, reason, polecatName, rigName, time.Now().Format(time.RFC3339))

	mail := exec.Command("gt", "mail", "send", rigName+"/"+polecatName, "-s", subject, "-m", body)
	mail.Dir = m.workDir

	out, err := mail.CombinedOutput()
	if err != nil {
		return fmt.Errorf("%w: %s", err, string(out))
	}
	return nil
}
// WitnessMessage represents a mail message for the witness.
// It is decoded from the JSON array printed by `gt mail inbox --json`.
type WitnessMessage struct {
// ID identifies the message; it is the argument passed to `gt mail archive`.
ID string `json:"id"`
// Subject is matched against the POLECAT_DONE / LIFECYCLE patterns.
Subject string `json:"subject"`
// Body is searched for a "Polecat: <name>" line on lifecycle requests.
Body string `json:"body"`
// From is the JSON "from" field; it is not read by the code in this file.
From string `json:"from"`
}
// getWitnessMessages fetches the witness mailbox by running
// `gt mail inbox --json` from the manager's working directory, with
// BEADS_AGENT_NAME set to "<rig>-witness".
//
// It returns (nil, nil) when there is nothing to process: the command
// reported "no messages" on stderr, or it printed nothing (or only
// whitespace). A command failure is returned as an error that wraps the
// underlying exec error and includes stderr, so the message is never empty
// even when the command produced no output (for example when the binary is
// missing). Output that is not a JSON array of messages yields a parse error.
func (m *Manager) getWitnessMessages() ([]WitnessMessage, error) {
	cmd := exec.Command("gt", "mail", "inbox", "--json")
	cmd.Dir = m.workDir
	cmd.Env = append(os.Environ(), "BEADS_AGENT_NAME="+m.rig.Name+"-witness")

	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		errText := strings.TrimSpace(stderr.String())
		// An empty mailbox is reported as a failure by the command but is
		// not an error for the witness.
		if strings.Contains(errText, "no messages") {
			return nil, nil
		}
		return nil, fmt.Errorf("gt mail inbox: %w: %s", err, errText)
	}

	payload := bytes.TrimSpace(stdout.Bytes())
	if len(payload) == 0 {
		return nil, nil
	}

	var messages []WitnessMessage
	if err := json.Unmarshal(payload, &messages); err != nil {
		return nil, fmt.Errorf("parsing messages: %w", err)
	}
	return messages, nil
}
// ackMessage marks a message as handled by archiving it with
// `gt mail archive <id>`. This is best effort: a failure is ignored, which
// means the message may be seen again on the next pass.
func (m *Manager) ackMessage(id string) {
	archive := exec.Command("gt", "mail", "archive", id)
	archive.Dir = m.workDir
	_ = archive.Run()
}
// lifecyclePolecatNameRe matches a "Polecat: <name>" field in a lifecycle
// request body. It is compiled once at package initialisation instead of on
// every call.
var lifecyclePolecatNameRe = regexp.MustCompile(`Polecat:\s*(\S+)`)

// extractPolecatName extracts the polecat name from a lifecycle request body.
// It returns the first run of non-space characters following "Polecat:", or
// the empty string when the body contains no such field.
func extractPolecatName(body string) string {
	if matches := lifecyclePolecatNameRe.FindStringSubmatch(body); len(matches) >= 2 {
		return matches[1]
	}
	return ""
}
// extractPolecatNameFromDone pulls the polecat name out of a POLECAT_DONE
// subject of the form "POLECAT_DONE {name}". Surrounding whitespace is
// trimmed from the name. Subjects that do not start with the
// "POLECAT_DONE " prefix yield the empty string.
func extractPolecatNameFromDone(subject string) string {
	const marker = "POLECAT_DONE "
	if !strings.HasPrefix(subject, marker) {
		return ""
	}
	return strings.TrimSpace(strings.TrimPrefix(subject, marker))
}
// recordDone remembers that a polecat has sent POLECAT_DONE by storing a
// "done:<name>" entry in SpawnedIssues and persisting the state. Nothing is
// written when the entry is already present.
func (m *Manager) recordDone(w *Witness, polecatName string) {
	marker := "done:" + polecatName
	for i := range w.SpawnedIssues {
		if w.SpawnedIssues[i] == marker {
			return // already recorded
		}
	}
	w.SpawnedIssues = append(w.SpawnedIssues, marker)
	_ = m.saveState(w) // best effort; the in-memory record is kept either way
}
// hasSentDone reports whether a "done:<name>" entry is recorded in
// SpawnedIssues, i.e. whether the polecat has sent POLECAT_DONE.
func (m *Manager) hasSentDone(w *Witness, polecatName string) bool {
	marker := "done:" + polecatName
	found := false
	for i := 0; i < len(w.SpawnedIssues) && !found; i++ {
		found = w.SpawnedIssues[i] == marker
	}
	return found
}
// PendingCompletionTimeout is how long to wait for POLECAT_DONE after issue is closed
// before force-killing the polecat session.
// The wait is measured from the first time the witness sees the closed issue,
// not from the moment the issue was closed.
const PendingCompletionTimeout = 10 * time.Minute
// checkPendingCompletions looks for running polecats whose issue is already
// closed but which have not sent POLECAT_DONE.
//
// The first time such a polecat is seen, a "waiting" timestamp is recorded
// and the polecat is nudged to run 'gt done'. Once PendingCompletionTimeout
// has elapsed since that timestamp, the polecat is force-cleaned, but only
// if verifyPolecatState confirms no work would be lost. If verification
// fails, the Mayor is notified instead and the polecat is left alone. Until
// the timeout, progress is only logged.
//
// Polecats that are not running, have already signalled done, have no known
// issue, or whose issue status cannot be determined are skipped. An error is
// returned only when the polecat list cannot be obtained.
func (m *Manager) checkPendingCompletions(w *Witness) error {
	polecatMgr := polecat.NewManager(m.rig, git.NewGit(m.rig.Path))
	polecats, err := polecatMgr.List()
	if err != nil {
		return fmt.Errorf("listing polecats: %w", err)
	}

	t := tmux.NewTmux()
	sessMgr := session.NewManager(t, m.rig)

	for _, p := range polecats {
		// Errors from IsRunning are treated the same as "not running".
		running, _ := sessMgr.IsRunning(p.Name)
		if !running {
			continue
		}
		if m.hasSentDone(w, p.Name) {
			continue
		}

		issueID := m.getPolecatIssue(p.Name, p.ClonePath)
		if issueID == "" {
			continue
		}
		closed, err := m.isIssueClosed(issueID)
		if err != nil {
			// Surface the failure instead of silently skipping the polecat.
			fmt.Printf("Warning: could not check status of issue %s: %v\n", issueID, err)
			continue
		}
		if !closed {
			continue
		}

		// Issue is closed but the polecat has not sent POLECAT_DONE.
		waitKey := "waiting:" + p.Name
		waitingSince := m.getWaitingTimestamp(w, waitKey)

		switch {
		case waitingSince.IsZero():
			// First detection: start the clock and nudge.
			fmt.Printf("Issue %s is closed but polecat %s hasn't signaled done\n", issueID, p.Name)
			m.recordWaiting(w, waitKey)
			if err := m.sendNudge(p.Name, "Your issue is closed. Please run 'gt done' to complete shutdown."); err != nil {
				fmt.Printf(" Warning: failed to send nudge: %v\n", err)
			}

		case time.Since(waitingSince) > PendingCompletionTimeout:
			fmt.Printf("Timeout waiting for POLECAT_DONE from %s, force cleaning up\n", p.Name)

			// Verification still protects uncommitted or unpushed work.
			if err := m.verifyPolecatState(p.Name); err != nil {
				fmt.Printf(" Cannot force cleanup - %v\n", err)
				if escErr := m.escalateToMayor(p.Name); escErr != nil {
					fmt.Printf(" Warning: failed to escalate: %v\n", escErr)
				}
				continue
			}
			if err := m.cleanupPolecat(p.Name); err != nil {
				fmt.Printf(" Force cleanup failed: %v\n", err)
				continue
			}
			m.clearWaiting(w, waitKey)

		default:
			elapsed := time.Since(waitingSince).Round(time.Minute)
			remaining := (PendingCompletionTimeout - time.Since(waitingSince)).Round(time.Minute)
			fmt.Printf("Waiting for POLECAT_DONE from %s (elapsed: %v, timeout in: %v)\n",
				p.Name, elapsed, remaining)
		}
	}
	return nil
}
// getPolecatIssue returns the issue a polecat is working on, read from the
// "issue_id" field of <polecatPath>/.runtime/state.json. It returns the
// empty string when the file is missing, unreadable or not valid JSON.
// The polecatName argument is not used.
func (m *Manager) getPolecatIssue(polecatName, polecatPath string) string {
	raw, err := os.ReadFile(filepath.Join(polecatPath, ".runtime", "state.json"))
	if err != nil {
		return ""
	}

	var parsed struct {
		IssueID string `json:"issue_id"`
	}
	if json.Unmarshal(raw, &parsed) != nil {
		return ""
	}
	return parsed.IssueID
}
// isIssueClosed reports whether an issue has status "closed" according to
// `bd show <id> --json`, run from the manager's working directory.
//
// The command output is decoded as a JSON array and the first element is
// inspected; an empty array is reported as not closed. A command failure is
// returned as an error that wraps the underlying exec error and includes
// stderr, so the message is never empty even when the command printed
// nothing.
func (m *Manager) isIssueClosed(issueID string) (bool, error) {
	cmd := exec.Command("bd", "show", issueID, "--json")
	cmd.Dir = m.workDir

	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		return false, fmt.Errorf("bd show %s: %w: %s", issueID, err, strings.TrimSpace(stderr.String()))
	}

	var issues []struct {
		Status string `json:"status"`
	}
	if err := json.Unmarshal(stdout.Bytes(), &issues); err != nil {
		return false, err
	}
	if len(issues) == 0 {
		return false, nil
	}
	return issues[0].Status == "closed", nil
}
// getWaitingTimestamp returns when the witness started waiting for a
// polecat. Waiting records are SpawnedIssues entries of the form
// "<key>:<RFC3339 timestamp>", where key is "waiting:<name>". The first
// entry with a parseable timestamp wins; the zero time is returned when
// there is none.
func (m *Manager) getWaitingTimestamp(w *Witness, key string) time.Time {
	lead := key + ":"
	for i := range w.SpawnedIssues {
		entry := w.SpawnedIssues[i]
		if !strings.HasPrefix(entry, lead) {
			continue
		}
		since, err := time.Parse(time.RFC3339, strings.TrimPrefix(entry, lead))
		if err != nil {
			continue
		}
		return since
	}
	return time.Time{}
}
// recordWaiting starts the waiting clock for a polecat by appending a
// "<key>:<RFC3339 now>" entry to SpawnedIssues and persisting the state.
func (m *Manager) recordWaiting(w *Witness, key string) {
	stamped := key + ":" + time.Now().Format(time.RFC3339)
	w.SpawnedIssues = append(w.SpawnedIssues, stamped)
	_ = m.saveState(w) // best effort; the in-memory record is kept either way
}
// clearWaiting removes the waiting timestamp(s) recorded for a polecat and
// persists the state. key is the "waiting:<name>" key used by recordWaiting.
//
// Entries are matched on "<key>:" (including the delimiter) rather than on
// the bare key, so clearing "waiting:foo" does not also remove the record of
// a different polecat whose name merely starts with "foo" (for example
// "waiting:foobar:<timestamp>"). This mirrors the match used by
// getWaitingTimestamp.
func (m *Manager) clearWaiting(w *Witness, key string) {
	prefix := key + ":"

	var filtered []string
	for _, entry := range w.SpawnedIssues {
		if !strings.HasPrefix(entry, prefix) {
			filtered = append(filtered, entry)
		}
	}
	w.SpawnedIssues = filtered
	_ = m.saveState(w) // best effort; the in-memory list is updated either way
}
// cleanupPolecat performs the full cleanup sequence for a transient polecat.
// 1. Check for uncommitted work (stubbornly refuses to lose work)
// 2. Kill session
// 3. Remove worktree
// 4. Delete branch
//
// If the polecat has uncommitted work (changes, stashes, or unpushed commits),
// the cleanup is aborted and an error is returned. The Witness will retry later.
//
// An error is also returned when removing the worktree fails for any reason
// other than the polecat not existing. Failures to stop the session or to
// delete the branch are only logged as warnings.
//
// NOTE(review): when the uncommitted-work check itself fails, the code logs a
// warning and carries on with the forced removal - confirm this is intended
// given the "refuses to lose work" guarantee above.
func (m *Manager) cleanupPolecat(polecatName string) error {
fmt.Printf(" Cleaning up polecat %s...\n", polecatName)
// Get managers
t := tmux.NewTmux()
sessMgr := session.NewManager(t, m.rig)
polecatGit := git.NewGit(m.rig.Path)
polecatMgr := polecat.NewManager(m.rig, polecatGit)
// Get polecat path for git check
polecatPath := filepath.Join(m.rig.Path, "polecats", polecatName)
// 1. Check for uncommitted work BEFORE doing anything destructive
pGit := git.NewGit(polecatPath)
status, err := pGit.CheckUncommittedWork()
if err != nil {
// If we can't check (e.g., not a git repo), log warning but continue
fmt.Printf(" Warning: could not check uncommitted work: %v\n", err)
} else if !status.Clean() {
// REFUSE to clean up - this is the key safety feature
fmt.Printf(" REFUSING to cleanup - polecat has uncommitted work:\n")
if status.HasUncommittedChanges {
fmt.Printf(" • %d uncommitted change(s)\n", len(status.ModifiedFiles)+len(status.UntrackedFiles))
}
if status.StashCount > 0 {
fmt.Printf(" • %d stash(es)\n", status.StashCount)
}
if status.UnpushedCommits > 0 {
fmt.Printf(" • %d unpushed commit(s)\n", status.UnpushedCommits)
}
return fmt.Errorf("polecat %s has uncommitted work: %s", polecatName, status.String())
}
// 2. Kill session
// An error from IsRunning is treated like "not running": no stop is attempted.
running, err := sessMgr.IsRunning(polecatName)
if err == nil && running {
fmt.Printf(" Killing session...\n")
if err := sessMgr.Stop(polecatName, true); err != nil {
fmt.Printf(" Warning: failed to stop session: %v\n", err)
}
}
// 3. Remove worktree (this also removes the directory)
// Use force=true since we've already verified no uncommitted work
fmt.Printf(" Removing worktree...\n")
if err := polecatMgr.RemoveWithOptions(polecatName, true, true); err != nil {
// Only error if polecat actually exists
if !errors.Is(err, polecat.ErrPolecatNotFound) {
return fmt.Errorf("removing worktree: %w", err)
}
}
// 4. Delete branch from mayor's clone
// The branch is named polecat/<name> and is deleted in <rig>/mayor/rig.
branchName := fmt.Sprintf("polecat/%s", polecatName)
mayorPath := filepath.Join(m.rig.Path, "mayor", "rig")
mayorGit := git.NewGit(mayorPath)
fmt.Printf(" Deleting branch %s...\n", branchName)
if err := mayorGit.DeleteBranch(branchName, true); err != nil {
// Branch might already be deleted or merged, not a critical error
fmt.Printf(" Warning: failed to delete branch: %v\n", err)
}
return nil
}
// processExists reports whether a process with the given PID is currently
// alive.
//
// It probes the process with signal 0, which performs the existence and
// permission checks without delivering a signal. A nil os.Signal cannot be
// used for this: os.Process.Signal rejects it as an unsupported signal type,
// which would make every process look dead. A permission error (EPERM) still
// means the process exists, merely under another user, so it counts as
// alive. Non-positive PIDs never identify a single process and are reported
// as not existing.
//
// NOTE(review): this probe is Unix-oriented; confirm behaviour if the
// witness is ever run on a platform where signal 0 is not supported.
func processExists(pid int) bool {
	if pid <= 0 {
		return false
	}
	proc, err := os.FindProcess(pid)
	if err != nil {
		return false
	}
	// On Unix, FindProcess always succeeds; signal 0 tests existence.
	err = proc.Signal(syscall.Signal(0))
	return err == nil || errors.Is(err, syscall.EPERM)
}
// ReadyIssue represents an issue from bd ready --json output.
type ReadyIssue struct {
// ID is the issue identifier; it is passed to `gt spawn --issue` and
// recorded in SpawnedIssues once a polecat was spawned for it.
ID string `json:"id"`
// Title is only used for log output.
Title string `json:"title"`
// Type is decoded from the "issue_type" field; "merge-request" and "epic"
// issues are never auto-spawned.
Type string `json:"issue_type"`
// Status is decoded from the "status" field; it is not read by the code in this file.
Status string `json:"status"`
}
// autoSpawnForReadyWork spawns polecats for ready issues until the worker
// capacity (Config.MaxWorkers, default 4) is reached.
//
// Ready issues are skipped when they are merge-requests or epics, when a
// polecat was already spawned for them, when Config.EpicID is set and the
// issue is not (or cannot be verified to be) a child of that epic, or when
// Config.IssuePrefix is set and the issue ID does not start with it.
//
// Consecutive spawns are separated by Config.SpawnDelayMs (default 5000 ms).
// A failed spawn is logged and does not count against capacity. The state
// is persisted only if at least one polecat was spawned.
func (m *Manager) autoSpawnForReadyWork(w *Witness) error {
	running, err := m.getActivePolecatCount()
	if err != nil {
		return fmt.Errorf("counting polecats: %w", err)
	}

	capacity := w.Config.MaxWorkers
	if capacity <= 0 {
		capacity = 4 // default
	}
	if running >= capacity {
		return nil
	}

	ready, err := m.getReadyIssues()
	if err != nil {
		return fmt.Errorf("getting ready issues: %w", err)
	}

	var candidates []ReadyIssue
	for _, issue := range ready {
		switch {
		case issue.Type == "merge-request", issue.Type == "epic":
			continue
		case m.isAlreadySpawned(w, issue.ID):
			continue
		}

		if epic := w.Config.EpicID; epic != "" {
			// Issues whose parentage cannot be verified are left out: safer
			// than picking up unknown work.
			child, err := m.isChildOfEpic(issue.ID, epic)
			if err != nil || !child {
				continue
			}
		}

		if prefix := w.Config.IssuePrefix; prefix != "" && !strings.HasPrefix(issue.ID, prefix) {
			continue
		}

		candidates = append(candidates, issue)
	}

	pauseMs := w.Config.SpawnDelayMs
	if pauseMs <= 0 {
		pauseMs = 5000 // default: 5 seconds
	}
	pause := time.Duration(pauseMs) * time.Millisecond

	launched := 0
	for _, issue := range candidates {
		if running+launched >= capacity {
			break
		}

		fmt.Printf("Auto-spawning for issue %s: %s\n", issue.ID, issue.Title)
		if err := m.spawnPolecat(issue.ID); err != nil {
			fmt.Printf(" Spawn failed: %v\n", err)
			continue
		}

		// Remember the issue so it is not spawned again on a later pass.
		w.SpawnedIssues = append(w.SpawnedIssues, issue.ID)
		launched++

		// Pause only while capacity remains and fewer polecats were
		// launched than there are candidates.
		if launched < len(candidates) && running+launched < capacity {
			time.Sleep(pause)
		}
	}

	if launched == 0 {
		return nil
	}
	return m.saveState(w)
}
// getActivePolecatCount counts the rig's polecats that currently have a
// running session. A polecat whose session state cannot be determined is
// not counted. An error is returned only when the polecats cannot be listed.
func (m *Manager) getActivePolecatCount() (int, error) {
	polecatMgr := polecat.NewManager(m.rig, git.NewGit(m.rig.Path))
	all, err := polecatMgr.List()
	if err != nil {
		return 0, err
	}

	sessions := session.NewManager(tmux.NewTmux(), m.rig)

	active := 0
	for _, pc := range all {
		if up, _ := sessions.IsRunning(pc.Name); up {
			active++
		}
	}
	return active, nil
}
// getReadyIssues returns issues ready to work (no blockers), as reported by
// `bd ready --json` run from the manager's working directory.
//
// It returns (nil, nil) when the command prints nothing. A command failure
// is returned as an error that wraps the underlying exec error and includes
// stderr, so the message is never empty even when the command printed
// nothing (for example when the binary is missing).
func (m *Manager) getReadyIssues() ([]ReadyIssue, error) {
	cmd := exec.Command("bd", "ready", "--json")
	cmd.Dir = m.workDir

	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		return nil, fmt.Errorf("bd ready: %w: %s", err, strings.TrimSpace(stderr.String()))
	}
	if stdout.Len() == 0 {
		return nil, nil
	}

	var issues []ReadyIssue
	if err := json.Unmarshal(stdout.Bytes(), &issues); err != nil {
		return nil, fmt.Errorf("parsing ready issues: %w", err)
	}
	return issues, nil
}
// issueDependency represents a dependency from bd show --json output.
type issueDependency struct {
// ID is the identifier of the related issue.
ID string `json:"id"`
// DependencyType is the kind of relation; isChildOfEpic looks for "blocks".
DependencyType string `json:"dependency_type"`
}
// issueWithDeps represents an issue with its dependencies from bd show --json.
type issueWithDeps struct {
// ID is the identifier of the issue that was shown.
ID string `json:"id"`
// Dependents is decoded from the "dependents" array of the output.
Dependents []issueDependency `json:"dependents"`
}
// isChildOfEpic checks if an issue blocks (is a child of) the given epic.
//
// It runs `bd show <issueID> --json` from the manager's working directory
// and reports true when the first issue in the output lists epicID among its
// dependents with dependency type "blocks". An empty result is reported as
// false. A command failure is returned as an error that wraps the underlying
// exec error and includes stderr, so the message is never empty even when
// the command printed nothing.
func (m *Manager) isChildOfEpic(issueID, epicID string) (bool, error) {
	cmd := exec.Command("bd", "show", issueID, "--json")
	cmd.Dir = m.workDir

	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		return false, fmt.Errorf("bd show %s: %w: %s", issueID, err, strings.TrimSpace(stderr.String()))
	}

	var issues []issueWithDeps
	if err := json.Unmarshal(stdout.Bytes(), &issues); err != nil {
		return false, fmt.Errorf("parsing issue: %w", err)
	}
	if len(issues) == 0 {
		return false, nil
	}

	for _, dep := range issues[0].Dependents {
		if dep.ID == epicID && dep.DependencyType == "blocks" {
			return true, nil
		}
	}
	return false, nil
}
// isAlreadySpawned reports whether the issue ID is already recorded in the
// witness's SpawnedIssues list, i.e. a polecat was spawned for it before.
func (m *Manager) isAlreadySpawned(w *Witness, issueID string) bool {
	seen := false
	for i := 0; i < len(w.SpawnedIssues) && !seen; i++ {
		seen = w.SpawnedIssues[i] == issueID
	}
	return seen
}
// spawnPolecat spawns a polecat for an issue by running
// `gt spawn --rig <rig> --issue <id>` from the manager's working directory.
// On success the command's trimmed output is logged. On failure the returned
// error wraps the underlying exec error and includes the command's combined
// output, so the message is never empty even when the command printed
// nothing (for example when the binary is missing).
func (m *Manager) spawnPolecat(issueID string) error {
	cmd := exec.Command("gt", "spawn", "--rig", m.rig.Name, "--issue", issueID)
	cmd.Dir = m.workDir

	output, err := cmd.CombinedOutput()
	report := strings.TrimSpace(string(output))
	if err != nil {
		return fmt.Errorf("gt spawn %s: %w: %s", issueID, err, report)
	}

	fmt.Printf(" Spawned: %s\n", report)
	return nil
}