Files
gastown/internal/refinery/manager.go
gastown/crew/mel 5218102f49 refactor(witness,refinery): ZFC-compliant state management
Remove state files from witness and refinery managers, following
the "Discover, Don't Track" principle. Tmux session existence is
now the source of truth for running state (like deacon).

Changes:
- Add IsRunning() that checks tmux HasSession
- Change Status() to return *tmux.SessionInfo
- Remove loadState/saveState/stateManager
- Simplify Start()/Stop() to not use state files
- Update CLI commands (witness/refinery/rig) for new API
- Update tests to be ZFC-compliant

This fixes state file divergence issues where witness/refinery
could show "running" when the actual tmux session was dead.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 20:00:56 -08:00

675 lines
22 KiB
Go

package refinery
import (
"errors"
"fmt"
"io"
"os"
"path/filepath"
"sort"
"strings"
"time"
"github.com/steveyegge/gastown/internal/beads"
"github.com/steveyegge/gastown/internal/config"
"github.com/steveyegge/gastown/internal/constants"
"github.com/steveyegge/gastown/internal/events"
"github.com/steveyegge/gastown/internal/mail"
"github.com/steveyegge/gastown/internal/rig"
"github.com/steveyegge/gastown/internal/runtime"
"github.com/steveyegge/gastown/internal/session"
"github.com/steveyegge/gastown/internal/tmux"
"github.com/steveyegge/gastown/internal/util"
)
// Common errors
var (
ErrNotRunning = errors.New("refinery not running")
ErrAlreadyRunning = errors.New("refinery already running")
ErrNoQueue = errors.New("no items in queue")
)
// Manager handles refinery lifecycle and queue operations.
type Manager struct {
rig *rig.Rig
workDir string
output io.Writer // Output destination for user-facing messages
}
// NewManager creates a new refinery manager for a rig.
func NewManager(r *rig.Rig) *Manager {
return &Manager{
rig: r,
workDir: r.Path,
output: os.Stdout,
}
}
// SetOutput sets the output writer for user-facing messages.
// This is useful for testing or redirecting output.
func (m *Manager) SetOutput(w io.Writer) {
m.output = w
}
// SessionName returns the tmux session name for this refinery.
func (m *Manager) SessionName() string {
return fmt.Sprintf("gt-%s-refinery", m.rig.Name)
}
// IsRunning checks if the refinery session is active.
// ZFC: tmux session existence is the source of truth.
func (m *Manager) IsRunning() (bool, error) {
t := tmux.NewTmux()
return t.HasSession(m.SessionName())
}
// Status returns information about the refinery session.
// ZFC-compliant: tmux session is the source of truth.
func (m *Manager) Status() (*tmux.SessionInfo, error) {
t := tmux.NewTmux()
sessionID := m.SessionName()
running, err := t.HasSession(sessionID)
if err != nil {
return nil, fmt.Errorf("checking session: %w", err)
}
if !running {
return nil, ErrNotRunning
}
return t.GetSessionInfo(sessionID)
}
// Start starts the refinery.
// If foreground is true, returns an error (foreground mode deprecated).
// Otherwise, spawns a Claude agent in a tmux session to process the merge queue.
// The agentOverride parameter allows specifying an agent alias to use instead of the town default.
// ZFC-compliant: no state file, tmux session is source of truth.
func (m *Manager) Start(foreground bool, agentOverride string) error {
t := tmux.NewTmux()
sessionID := m.SessionName()
if foreground {
// Foreground mode is deprecated - the Refinery agent handles merge processing
return fmt.Errorf("foreground mode is deprecated; use background mode (remove --foreground flag)")
}
// Check if session already exists
running, _ := t.HasSession(sessionID)
if running {
// Session exists - check if Claude is actually running (healthy vs zombie)
// Use IsClaudeRunning for robust detection: Claude can report as "node", "claude",
// or version number like "2.0.76". IsAgentRunning with just "node" was too strict
// and caused healthy sessions to be killed. See: gastown#566
if t.IsClaudeRunning(sessionID) {
// Healthy - Claude is running
return ErrAlreadyRunning
}
// Zombie - tmux alive but Claude dead. Kill and recreate.
_, _ = fmt.Fprintln(m.output, "⚠ Detected zombie session (tmux alive, agent dead). Recreating...")
if err := t.KillSession(sessionID); err != nil {
return fmt.Errorf("killing zombie session: %w", err)
}
}
// Note: No PID check per ZFC - tmux session is the source of truth
// Background mode: spawn a Claude agent in a tmux session
// The Claude agent handles MR processing using git commands and beads
// Working directory is the refinery worktree (shares .git with mayor/polecats)
refineryRigDir := filepath.Join(m.rig.Path, "refinery", "rig")
if _, err := os.Stat(refineryRigDir); os.IsNotExist(err) {
// Fall back to mayor/rig (legacy architecture) - ensures we use project git, not town git.
// Using rig.Path directly would find town's .git with rig-named remotes instead of "origin".
refineryRigDir = filepath.Join(m.rig.Path, "mayor", "rig")
}
// Ensure runtime settings exist in refinery/ (not refinery/rig/) so we don't
// write into the source repo. Runtime walks up the tree to find settings.
refineryParentDir := filepath.Join(m.rig.Path, "refinery")
runtimeConfig := config.LoadRuntimeConfig(m.rig.Path)
if err := runtime.EnsureSettingsForRole(refineryParentDir, "refinery", runtimeConfig); err != nil {
return fmt.Errorf("ensuring runtime settings: %w", err)
}
// Build startup command first
townRoot := filepath.Dir(m.rig.Path)
var command string
if agentOverride != "" {
var err error
command, err = config.BuildAgentStartupCommandWithAgentOverride("refinery", m.rig.Name, townRoot, m.rig.Path, "", agentOverride)
if err != nil {
return fmt.Errorf("building startup command with agent override: %w", err)
}
} else {
command = config.BuildAgentStartupCommand("refinery", m.rig.Name, townRoot, m.rig.Path, "")
}
// Create session with command directly to avoid send-keys race condition.
// See: https://github.com/anthropics/gastown/issues/280
if err := t.NewSessionWithCommand(sessionID, refineryRigDir, command); err != nil {
return fmt.Errorf("creating tmux session: %w", err)
}
// Set environment variables (non-fatal: session works without these)
// Use centralized AgentEnv for consistency across all role startup paths
envVars := config.AgentEnv(config.AgentEnvConfig{
Role: "refinery",
Rig: m.rig.Name,
TownRoot: townRoot,
BeadsNoDaemon: true,
})
// Add refinery-specific flag
envVars["GT_REFINERY"] = "1"
// Set all env vars in tmux session (for debugging) and they'll also be exported to Claude
for k, v := range envVars {
_ = t.SetEnvironment(sessionID, k, v)
}
// Apply theme (non-fatal: theming failure doesn't affect operation)
theme := tmux.AssignTheme(m.rig.Name)
_ = t.ConfigureGasTownSession(sessionID, theme, m.rig.Name, "refinery", "refinery")
// Wait for Claude to start and show its prompt - fatal if Claude fails to launch
// WaitForRuntimeReady waits for the runtime to be ready
if err := t.WaitForRuntimeReady(sessionID, runtimeConfig, constants.ClaudeStartTimeout); err != nil {
// Kill the zombie session before returning error
_ = t.KillSessionWithProcesses(sessionID)
return fmt.Errorf("waiting for refinery to start: %w", err)
}
// Accept bypass permissions warning dialog if it appears.
_ = t.AcceptBypassPermissionsWarning(sessionID)
// Wait for runtime to be fully ready
runtime.SleepForReadyDelay(runtimeConfig)
_ = runtime.RunStartupFallback(t, sessionID, "refinery", runtimeConfig)
// Inject startup nudge for predecessor discovery via /resume
address := fmt.Sprintf("%s/refinery", m.rig.Name)
_ = session.StartupNudge(t, sessionID, session.StartupNudgeConfig{
Recipient: address,
Sender: "deacon",
Topic: "patrol",
}) // Non-fatal
// GUPP: Gas Town Universal Propulsion Principle
// Send the propulsion nudge to trigger autonomous patrol execution.
// Wait for beacon to be fully processed (needs to be separate prompt)
time.Sleep(2 * time.Second)
_ = t.NudgeSession(sessionID, session.PropulsionNudgeForRole("refinery", refineryRigDir)) // Non-fatal
return nil
}
// Stop stops the refinery.
// ZFC-compliant: tmux session is the source of truth.
func (m *Manager) Stop() error {
t := tmux.NewTmux()
sessionID := m.SessionName()
// Check if tmux session exists
running, _ := t.HasSession(sessionID)
if !running {
return ErrNotRunning
}
// Kill the tmux session
return t.KillSession(sessionID)
}
// Queue returns the current merge queue.
// Uses beads merge-request issues as the source of truth (not git branches).
// ZFC-compliant: beads is the source of truth, no state file.
func (m *Manager) Queue() ([]QueueItem, error) {
// Query beads for open merge-request type issues
// BeadsPath() returns the git-synced beads location
b := beads.New(m.rig.BeadsPath())
issues, err := b.List(beads.ListOptions{
Type: "merge-request",
Status: "open",
Priority: -1, // No priority filter
})
if err != nil {
return nil, fmt.Errorf("querying merge queue from beads: %w", err)
}
// Score and sort issues by priority score (highest first)
now := time.Now()
type scoredIssue struct {
issue *beads.Issue
score float64
}
scored := make([]scoredIssue, 0, len(issues))
for _, issue := range issues {
score := m.calculateIssueScore(issue, now)
scored = append(scored, scoredIssue{issue: issue, score: score})
}
sort.Slice(scored, func(i, j int) bool {
return scored[i].score > scored[j].score
})
// Convert scored issues to queue items
var items []QueueItem
pos := 1
for _, s := range scored {
mr := m.issueToMR(s.issue)
if mr != nil {
items = append(items, QueueItem{
Position: pos,
MR: mr,
Age: formatAge(mr.CreatedAt),
})
pos++
}
}
return items, nil
}
// calculateIssueScore computes the priority score for an MR issue.
// Higher scores mean higher priority (process first).
func (m *Manager) calculateIssueScore(issue *beads.Issue, now time.Time) float64 {
fields := beads.ParseMRFields(issue)
// Parse MR creation time
mrCreatedAt := parseTime(issue.CreatedAt)
if mrCreatedAt.IsZero() {
mrCreatedAt = now // Fallback
}
// Build score input
input := ScoreInput{
Priority: issue.Priority,
MRCreatedAt: mrCreatedAt,
Now: now,
}
// Add fields from MR metadata if available
if fields != nil {
input.RetryCount = fields.RetryCount
// Parse convoy created at if available
if fields.ConvoyCreatedAt != "" {
if convoyTime := parseTime(fields.ConvoyCreatedAt); !convoyTime.IsZero() {
input.ConvoyCreatedAt = &convoyTime
}
}
}
return ScoreMRWithDefaults(input)
}
// issueToMR converts a beads issue to a MergeRequest.
func (m *Manager) issueToMR(issue *beads.Issue) *MergeRequest {
if issue == nil {
return nil
}
// Get configured default branch for this rig
defaultBranch := m.rig.DefaultBranch()
fields := beads.ParseMRFields(issue)
if fields == nil {
// No MR fields in description, construct from title/ID
return &MergeRequest{
ID: issue.ID,
IssueID: issue.ID,
Status: MROpen,
CreatedAt: parseTime(issue.CreatedAt),
TargetBranch: defaultBranch,
}
}
// Default target to rig's default branch if not specified
target := fields.Target
if target == "" {
target = defaultBranch
}
return &MergeRequest{
ID: issue.ID,
Branch: fields.Branch,
Worker: fields.Worker,
IssueID: fields.SourceIssue,
TargetBranch: target,
Status: MROpen,
CreatedAt: parseTime(issue.CreatedAt),
}
}
// parseTime parses a time string, returning zero time on error.
func parseTime(s string) time.Time {
// Try RFC3339 first (most common)
t, err := time.Parse(time.RFC3339, s)
if err != nil {
// Try date-only format as fallback
t, _ = time.Parse("2006-01-02", s)
}
return t
}
// run is deprecated - foreground mode now just prints a message.
// The Refinery agent (Claude) handles all merge processing.
// See: ZFC #5 - Move merge/conflict decisions from Go to Refinery agent
func (m *Manager) run(_ *Refinery) error { // ref unused: deprecated function
_, _ = fmt.Fprintln(m.output, "")
_, _ = fmt.Fprintln(m.output, "╔══════════════════════════════════════════════════════════════╗")
_, _ = fmt.Fprintln(m.output, "║ Foreground mode is deprecated. ║")
_, _ = fmt.Fprintln(m.output, "║ ║")
_, _ = fmt.Fprintln(m.output, "║ The Refinery agent (Claude) handles all merge decisions. ║")
_, _ = fmt.Fprintln(m.output, "║ Use 'gt refinery start' to run in background mode. ║")
_, _ = fmt.Fprintln(m.output, "╚══════════════════════════════════════════════════════════════╝")
_, _ = fmt.Fprintln(m.output, "")
return nil
}
// MergeResult contains the result of a merge attempt.
type MergeResult struct {
Success bool
MergeCommit string // SHA of merge commit on success
Error string
Conflict bool
TestsFailed bool
}
// ProcessMR is deprecated - the Refinery agent now handles all merge processing.
//
// ZFC #5: Move merge/conflict decisions from Go to Refinery agent
//
// The agent runs git commands directly and makes decisions based on output:
// - Agent attempts merge: git checkout -b temp origin/polecat/<worker>
// - Agent detects conflict and decides: retry, notify polecat, escalate
// - Agent runs tests and decides: proceed, rollback, retry
// - Agent pushes: git push origin main
//
// This function is kept for backwards compatibility but always returns an error
// indicating that the agent should handle merge processing.
//
// Deprecated: Use the Refinery agent (Claude) for merge processing.
func (m *Manager) ProcessMR(mr *MergeRequest) MergeResult {
return MergeResult{
Error: "ProcessMR is deprecated - the Refinery agent handles merge processing (ZFC #5)",
}
}
// completeMR marks an MR as complete.
// For success, pass closeReason (e.g., CloseReasonMerged).
// For failures that should return to open, pass empty closeReason.
// ZFC-compliant: no state file, just updates MR and emits events.
// Deprecated: The Refinery agent handles merge processing (ZFC #5).
func (m *Manager) completeMR(mr *MergeRequest, closeReason CloseReason, errMsg string) {
mr.Error = errMsg
actor := fmt.Sprintf("%s/refinery", m.rig.Name)
if closeReason != "" {
// Close the MR (in_progress → closed)
if err := mr.Close(closeReason); err != nil {
// Log error but continue - this shouldn't happen
_, _ = fmt.Fprintf(m.output, "Warning: failed to close MR: %v\n", err)
}
if closeReason == CloseReasonSuperseded {
// Emit merge_skipped event
_ = events.LogFeed(events.TypeMergeSkipped, actor, events.MergePayload(mr.ID, mr.Worker, mr.Branch, "superseded"))
}
} else {
// Reopen the MR for rework (in_progress → open)
if err := mr.Reopen(); err != nil {
// Log error but continue
_, _ = fmt.Fprintf(m.output, "Warning: failed to reopen MR: %v\n", err)
}
}
}
// runTests executes the test command.
// Deprecated: The Refinery agent runs tests directly via shell commands (ZFC #5).
func (m *Manager) runTests(testCmd string) error {
parts := strings.Fields(testCmd)
if len(parts) == 0 {
return nil
}
return util.ExecRun(m.workDir, parts[0], parts[1:]...)
}
// getMergeConfig loads the merge configuration from disk.
// Returns default config if not configured.
// Deprecated: Configuration is read by the agent from settings (ZFC #5).
func (m *Manager) getMergeConfig() MergeConfig {
mergeConfig := DefaultMergeConfig()
// Check settings/config.json for merge_queue settings
settingsPath := filepath.Join(m.rig.Path, "settings", "config.json")
settings, err := config.LoadRigSettings(settingsPath)
if err != nil {
return mergeConfig
}
// Apply merge_queue config if present
if settings.MergeQueue != nil {
mq := settings.MergeQueue
mergeConfig.TestCommand = mq.TestCommand
mergeConfig.RunTests = mq.RunTests
mergeConfig.DeleteMergedBranches = mq.DeleteMergedBranches
// Note: PushRetryCount and PushRetryDelayMs use defaults if not explicitly set
}
return mergeConfig
}
// pushWithRetry pushes to the target branch with exponential backoff retry.
// Deprecated: The Refinery agent decides retry strategy (ZFC #5).
func (m *Manager) pushWithRetry(targetBranch string, config MergeConfig) error {
var lastErr error
delay := time.Duration(config.PushRetryDelayMs) * time.Millisecond
for attempt := 0; attempt <= config.PushRetryCount; attempt++ {
if attempt > 0 {
_, _ = fmt.Fprintf(m.output, "Push retry %d/%d after %v\n", attempt, config.PushRetryCount, delay)
time.Sleep(delay)
delay *= 2 // Exponential backoff
}
err := util.ExecRun(m.workDir, "git", "push", "origin", targetBranch)
if err == nil {
return nil // Success
}
lastErr = err
}
return fmt.Errorf("push failed after %d retries: %v", config.PushRetryCount, lastErr)
}
// formatAge formats a duration since the given time.
func formatAge(t time.Time) string {
d := time.Since(t)
if d < time.Minute {
return fmt.Sprintf("%ds ago", int(d.Seconds()))
}
if d < time.Hour {
return fmt.Sprintf("%dm ago", int(d.Minutes()))
}
if d < 24*time.Hour {
return fmt.Sprintf("%dh ago", int(d.Hours()))
}
return fmt.Sprintf("%dd ago", int(d.Hours()/24))
}
// notifyWorkerConflict sends a conflict notification to a polecat.
func (m *Manager) notifyWorkerConflict(mr *MergeRequest) {
router := mail.NewRouter(m.workDir)
msg := &mail.Message{
From: fmt.Sprintf("%s/refinery", m.rig.Name),
To: fmt.Sprintf("%s/%s", m.rig.Name, mr.Worker),
Subject: "Merge conflict - rebase required",
Body: fmt.Sprintf(`Your branch %s has conflicts with %s.
Please rebase your changes:
git fetch origin
git rebase origin/%s
git push -f
Then the Refinery will retry the merge.`,
mr.Branch, mr.TargetBranch, mr.TargetBranch),
Priority: mail.PriorityHigh,
}
_ = router.Send(msg) // best-effort notification
}
// notifyWorkerMerged sends a success notification to a polecat.
func (m *Manager) notifyWorkerMerged(mr *MergeRequest) {
router := mail.NewRouter(m.workDir)
msg := &mail.Message{
From: fmt.Sprintf("%s/refinery", m.rig.Name),
To: fmt.Sprintf("%s/%s", m.rig.Name, mr.Worker),
Subject: "Work merged successfully",
Body: fmt.Sprintf(`Your branch %s has been merged to %s.
Issue: %s
Thank you for your contribution!`,
mr.Branch, mr.TargetBranch, mr.IssueID),
}
_ = router.Send(msg) // best-effort notification
}
// Common errors for MR operations
var (
ErrMRNotFound = errors.New("merge request not found")
ErrMRNotFailed = errors.New("merge request has not failed")
)
// GetMR returns a merge request by ID.
// ZFC-compliant: delegates to FindMR which uses beads as source of truth.
// Deprecated: Use FindMR directly for more flexible matching.
func (m *Manager) GetMR(id string) (*MergeRequest, error) {
return m.FindMR(id)
}
// FindMR finds a merge request by ID or branch name in the queue.
func (m *Manager) FindMR(idOrBranch string) (*MergeRequest, error) {
queue, err := m.Queue()
if err != nil {
return nil, err
}
for _, item := range queue {
// Match by ID
if item.MR.ID == idOrBranch {
return item.MR, nil
}
// Match by branch name (with or without polecat/ prefix)
if item.MR.Branch == idOrBranch {
return item.MR, nil
}
if constants.BranchPolecatPrefix+idOrBranch == item.MR.Branch {
return item.MR, nil
}
// Match by worker name (partial match for convenience)
if strings.Contains(item.MR.ID, idOrBranch) {
return item.MR, nil
}
}
return nil, ErrMRNotFound
}
// Retry is deprecated - the Refinery agent handles retry logic autonomously.
// ZFC-compliant: no state file, agent uses beads issue status.
// The agent will automatically retry failed MRs in its patrol cycle.
func (m *Manager) Retry(_ string, _ bool) error {
_, _ = fmt.Fprintln(m.output, "Note: Retry is deprecated. The Refinery agent handles retries autonomously via beads.")
return nil
}
// RegisterMR is deprecated - MRs are registered via beads merge-request issues.
// ZFC-compliant: beads is the source of truth, not state file.
// Use 'gt mr create' or create a merge-request type bead directly.
func (m *Manager) RegisterMR(_ *MergeRequest) error {
return fmt.Errorf("RegisterMR is deprecated: use beads to create merge-request issues")
}
// RejectMR manually rejects a merge request.
// It closes the MR with rejected status and optionally notifies the worker.
// Returns the rejected MR for display purposes.
func (m *Manager) RejectMR(idOrBranch string, reason string, notify bool) (*MergeRequest, error) {
mr, err := m.FindMR(idOrBranch)
if err != nil {
return nil, err
}
// Verify MR is open or in_progress (can't reject already closed)
if mr.IsClosed() {
return nil, fmt.Errorf("%w: MR is already closed with reason: %s", ErrClosedImmutable, mr.CloseReason)
}
// Close the bead in storage with the rejection reason
b := beads.New(m.rig.BeadsPath())
if err := b.CloseWithReason("rejected: "+reason, mr.ID); err != nil {
return nil, fmt.Errorf("failed to close MR bead: %w", err)
}
// Update in-memory state for return value
if err := mr.Close(CloseReasonRejected); err != nil {
// Non-fatal: bead is already closed, just log
_, _ = fmt.Fprintf(m.output, "Warning: failed to update MR state: %v\n", err)
}
mr.Error = reason
// Optionally notify worker
if notify {
m.notifyWorkerRejected(mr, reason)
}
return mr, nil
}
// notifyWorkerRejected sends a rejection notification to a polecat.
func (m *Manager) notifyWorkerRejected(mr *MergeRequest, reason string) {
router := mail.NewRouter(m.workDir)
msg := &mail.Message{
From: fmt.Sprintf("%s/refinery", m.rig.Name),
To: fmt.Sprintf("%s/%s", m.rig.Name, mr.Worker),
Subject: "Merge request rejected",
Body: fmt.Sprintf(`Your merge request has been rejected.
Branch: %s
Issue: %s
Reason: %s
Please review the feedback and address the issues before resubmitting.`,
mr.Branch, mr.IssueID, reason),
Priority: mail.PriorityNormal,
}
_ = router.Send(msg) // best-effort notification
}
// findTownRoot walks up directories to find the town root.
func findTownRoot(startPath string) string {
path := startPath
for {
// Check for mayor/ subdirectory (indicates town root)
if _, err := os.Stat(filepath.Join(path, "mayor")); err == nil {
return path
}
// Check for config.json with type: workspace
configPath := filepath.Join(path, "config.json")
if data, err := os.ReadFile(configPath); err == nil {
if strings.Contains(string(data), `"type": "workspace"`) {
return path
}
}
parent := filepath.Dir(path)
if parent == path {
break // Reached root
}
path = parent
}
return ""
}