Files
gastown/internal/polecat/session_manager.go
nux eb18dbf9e2 fix(sling): verify session survives startup before returning success
The Start() function was returning success even if the pane died during
initialization (e.g., if Claude failed to start). This caused the caller
to get a confusing "getting pane" error when trying to use the session.

Now Start() verifies the session is still running at the end, returning
a clear error message if the session died during startup.

Fixes: gt-0cif0s

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-16 16:03:29 -08:00

509 lines
15 KiB
Go

// Package polecat provides polecat workspace and session management.
package polecat
import (
"encoding/json"
"errors"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
"github.com/steveyegge/gastown/internal/config"
"github.com/steveyegge/gastown/internal/constants"
"github.com/steveyegge/gastown/internal/rig"
"github.com/steveyegge/gastown/internal/runtime"
"github.com/steveyegge/gastown/internal/session"
"github.com/steveyegge/gastown/internal/tmux"
)
// debugSession logs non-fatal errors during session startup when GT_DEBUG_SESSION=1.
func debugSession(context string, err error) {
if os.Getenv("GT_DEBUG_SESSION") != "" && err != nil {
fmt.Fprintf(os.Stderr, "[session-debug] %s: %v\n", context, err)
}
}
// Session errors
var (
ErrSessionRunning = errors.New("session already running")
ErrSessionNotFound = errors.New("session not found")
ErrIssueInvalid = errors.New("issue not found or tombstoned")
)
// SessionManager handles polecat session lifecycle.
type SessionManager struct {
tmux *tmux.Tmux
rig *rig.Rig
}
// NewSessionManager creates a new polecat session manager for a rig.
func NewSessionManager(t *tmux.Tmux, r *rig.Rig) *SessionManager {
return &SessionManager{
tmux: t,
rig: r,
}
}
// SessionStartOptions configures polecat session startup.
type SessionStartOptions struct {
// WorkDir overrides the default working directory (polecat clone dir).
WorkDir string
// Issue is an optional issue ID to work on.
Issue string
// Command overrides the default "claude" command.
Command string
// Account specifies the account handle to use (overrides default).
Account string
// RuntimeConfigDir is resolved config directory for the runtime account.
// If set, this is injected as an environment variable.
RuntimeConfigDir string
}
// SessionInfo contains information about a running polecat session.
type SessionInfo struct {
// Polecat is the polecat name.
Polecat string `json:"polecat"`
// SessionID is the tmux session identifier.
SessionID string `json:"session_id"`
// Running indicates if the session is currently active.
Running bool `json:"running"`
// RigName is the rig this session belongs to.
RigName string `json:"rig_name"`
// Attached indicates if someone is attached to the session.
Attached bool `json:"attached,omitempty"`
// Created is when the session was created.
Created time.Time `json:"created,omitempty"`
// Windows is the number of tmux windows.
Windows int `json:"windows,omitempty"`
// LastActivity is when the session last had activity.
LastActivity time.Time `json:"last_activity,omitempty"`
}
// SessionName generates the tmux session name for a polecat.
func (m *SessionManager) SessionName(polecat string) string {
return fmt.Sprintf("gt-%s-%s", m.rig.Name, polecat)
}
// polecatDir returns the parent directory for a polecat.
// This is polecats/<name>/ - the polecat's home directory.
func (m *SessionManager) polecatDir(polecat string) string {
return filepath.Join(m.rig.Path, "polecats", polecat)
}
// clonePath returns the path where the git worktree lives.
// New structure: polecats/<name>/<rigname>/ - gives LLMs recognizable repo context.
// Falls back to old structure: polecats/<name>/ for backward compatibility.
func (m *SessionManager) clonePath(polecat string) string {
// New structure: polecats/<name>/<rigname>/
newPath := filepath.Join(m.rig.Path, "polecats", polecat, m.rig.Name)
if info, err := os.Stat(newPath); err == nil && info.IsDir() {
return newPath
}
// Old structure: polecats/<name>/ (backward compat)
oldPath := filepath.Join(m.rig.Path, "polecats", polecat)
if info, err := os.Stat(oldPath); err == nil && info.IsDir() {
// Check if this is actually a git worktree (has .git file or dir)
gitPath := filepath.Join(oldPath, ".git")
if _, err := os.Stat(gitPath); err == nil {
return oldPath
}
}
// Default to new structure for new polecats
return newPath
}
// hasPolecat checks if the polecat exists in this rig.
func (m *SessionManager) hasPolecat(polecat string) bool {
polecatPath := m.polecatDir(polecat)
info, err := os.Stat(polecatPath)
if err != nil {
return false
}
return info.IsDir()
}
// Start creates and starts a new session for a polecat.
func (m *SessionManager) Start(polecat string, opts SessionStartOptions) error {
if !m.hasPolecat(polecat) {
return fmt.Errorf("%w: %s", ErrPolecatNotFound, polecat)
}
sessionID := m.SessionName(polecat)
// Check if session already exists
// Note: Orphan sessions are cleaned up by ReconcilePool during AllocateName,
// so by this point, any existing session should be legitimately in use.
running, err := m.tmux.HasSession(sessionID)
if err != nil {
return fmt.Errorf("checking session: %w", err)
}
if running {
return fmt.Errorf("%w: %s", ErrSessionRunning, sessionID)
}
// Determine working directory
workDir := opts.WorkDir
if workDir == "" {
workDir = m.clonePath(polecat)
}
// Validate issue exists and isn't tombstoned BEFORE creating session.
// This prevents CPU spin loops from agents retrying work on invalid issues.
if opts.Issue != "" {
if err := m.validateIssue(opts.Issue, workDir); err != nil {
return err
}
}
runtimeConfig := config.LoadRuntimeConfig(m.rig.Path)
// Ensure runtime settings exist in polecats/ (not polecats/<name>/) so we don't
// write into the source repo. Runtime walks up the tree to find settings.
polecatsDir := filepath.Join(m.rig.Path, "polecats")
if err := runtime.EnsureSettingsForRole(polecatsDir, "polecat", runtimeConfig); err != nil {
return fmt.Errorf("ensuring runtime settings: %w", err)
}
// Build startup command first
command := opts.Command
if command == "" {
command = config.BuildPolecatStartupCommand(m.rig.Name, polecat, m.rig.Path, "")
}
// Prepend runtime config dir env if needed
if runtimeConfig.Session != nil && runtimeConfig.Session.ConfigDirEnv != "" && opts.RuntimeConfigDir != "" {
command = config.PrependEnv(command, map[string]string{runtimeConfig.Session.ConfigDirEnv: opts.RuntimeConfigDir})
}
// Create session with command directly to avoid send-keys race condition.
// See: https://github.com/anthropics/gastown/issues/280
if err := m.tmux.NewSessionWithCommand(sessionID, workDir, command); err != nil {
return fmt.Errorf("creating session: %w", err)
}
// Set environment (non-fatal: session works without these)
// Use centralized AgentEnv for consistency across all role startup paths
townRoot := filepath.Dir(m.rig.Path)
envVars := config.AgentEnv(config.AgentEnvConfig{
Role: "polecat",
Rig: m.rig.Name,
AgentName: polecat,
TownRoot: townRoot,
RuntimeConfigDir: opts.RuntimeConfigDir,
BeadsNoDaemon: true,
})
for k, v := range envVars {
debugSession("SetEnvironment "+k, m.tmux.SetEnvironment(sessionID, k, v))
}
// Hook the issue to the polecat if provided via --issue flag
if opts.Issue != "" {
agentID := fmt.Sprintf("%s/polecats/%s", m.rig.Name, polecat)
if err := m.hookIssue(opts.Issue, agentID, workDir); err != nil {
fmt.Printf("Warning: could not hook issue %s: %v\n", opts.Issue, err)
}
}
// Apply theme (non-fatal)
theme := tmux.AssignTheme(m.rig.Name)
debugSession("ConfigureGasTownSession", m.tmux.ConfigureGasTownSession(sessionID, theme, m.rig.Name, polecat, "polecat"))
// Set pane-died hook for crash detection (non-fatal)
agentID := fmt.Sprintf("%s/%s", m.rig.Name, polecat)
debugSession("SetPaneDiedHook", m.tmux.SetPaneDiedHook(sessionID, agentID))
// Wait for Claude to start (non-fatal)
debugSession("WaitForCommand", m.tmux.WaitForCommand(sessionID, constants.SupportedShells, constants.ClaudeStartTimeout))
// Accept bypass permissions warning dialog if it appears
debugSession("AcceptBypassPermissionsWarning", m.tmux.AcceptBypassPermissionsWarning(sessionID))
// Wait for runtime to be fully ready at the prompt (not just started)
runtime.SleepForReadyDelay(runtimeConfig)
_ = runtime.RunStartupFallback(m.tmux, sessionID, "polecat", runtimeConfig)
// Inject startup nudge for predecessor discovery via /resume
address := fmt.Sprintf("%s/polecats/%s", m.rig.Name, polecat)
debugSession("StartupNudge", session.StartupNudge(m.tmux, sessionID, session.StartupNudgeConfig{
Recipient: address,
Sender: "witness",
Topic: "assigned",
MolID: opts.Issue,
}))
// GUPP: Send propulsion nudge to trigger autonomous work execution
time.Sleep(2 * time.Second)
debugSession("NudgeSession PropulsionNudge", m.tmux.NudgeSession(sessionID, session.PropulsionNudge()))
// Verify session survived startup - if the command crashed, the session may have died.
// Without this check, Start() would return success even if the pane died during initialization.
running, err = m.tmux.HasSession(sessionID)
if err != nil {
return fmt.Errorf("verifying session: %w", err)
}
if !running {
return fmt.Errorf("session %s died during startup (agent command may have failed)", sessionID)
}
return nil
}
// Stop terminates a polecat session.
func (m *SessionManager) Stop(polecat string, force bool) error {
sessionID := m.SessionName(polecat)
running, err := m.tmux.HasSession(sessionID)
if err != nil {
return fmt.Errorf("checking session: %w", err)
}
if !running {
return ErrSessionNotFound
}
// Sync beads before shutdown (non-fatal)
if !force {
polecatDir := m.polecatDir(polecat)
if err := m.syncBeads(polecatDir); err != nil {
fmt.Printf("Warning: beads sync failed: %v\n", err)
}
}
// Try graceful shutdown first
if !force {
_ = m.tmux.SendKeysRaw(sessionID, "C-c")
time.Sleep(100 * time.Millisecond)
}
if err := m.tmux.KillSession(sessionID); err != nil {
return fmt.Errorf("killing session: %w", err)
}
return nil
}
// syncBeads runs bd sync in the given directory.
func (m *SessionManager) syncBeads(workDir string) error {
cmd := exec.Command("bd", "sync")
cmd.Dir = workDir
return cmd.Run()
}
// IsRunning checks if a polecat session is active.
func (m *SessionManager) IsRunning(polecat string) (bool, error) {
sessionID := m.SessionName(polecat)
return m.tmux.HasSession(sessionID)
}
// Status returns detailed status for a polecat session.
func (m *SessionManager) Status(polecat string) (*SessionInfo, error) {
sessionID := m.SessionName(polecat)
running, err := m.tmux.HasSession(sessionID)
if err != nil {
return nil, fmt.Errorf("checking session: %w", err)
}
info := &SessionInfo{
Polecat: polecat,
SessionID: sessionID,
Running: running,
RigName: m.rig.Name,
}
if !running {
return info, nil
}
tmuxInfo, err := m.tmux.GetSessionInfo(sessionID)
if err != nil {
return info, nil
}
info.Attached = tmuxInfo.Attached
info.Windows = tmuxInfo.Windows
if tmuxInfo.Created != "" {
formats := []string{
"Mon Jan 2 15:04:05 2006",
"Mon Jan _2 15:04:05 2006",
time.ANSIC,
time.UnixDate,
}
for _, format := range formats {
if t, err := time.Parse(format, tmuxInfo.Created); err == nil {
info.Created = t
break
}
}
}
if tmuxInfo.Activity != "" {
var activityUnix int64
if _, err := fmt.Sscanf(tmuxInfo.Activity, "%d", &activityUnix); err == nil && activityUnix > 0 {
info.LastActivity = time.Unix(activityUnix, 0)
}
}
return info, nil
}
// List returns information about all polecat sessions for this rig.
func (m *SessionManager) List() ([]SessionInfo, error) {
sessions, err := m.tmux.ListSessions()
if err != nil {
return nil, err
}
prefix := fmt.Sprintf("gt-%s-", m.rig.Name)
var infos []SessionInfo
for _, sessionID := range sessions {
if !strings.HasPrefix(sessionID, prefix) {
continue
}
polecat := strings.TrimPrefix(sessionID, prefix)
infos = append(infos, SessionInfo{
Polecat: polecat,
SessionID: sessionID,
Running: true,
RigName: m.rig.Name,
})
}
return infos, nil
}
// Attach attaches to a polecat session.
func (m *SessionManager) Attach(polecat string) error {
sessionID := m.SessionName(polecat)
running, err := m.tmux.HasSession(sessionID)
if err != nil {
return fmt.Errorf("checking session: %w", err)
}
if !running {
return ErrSessionNotFound
}
return m.tmux.AttachSession(sessionID)
}
// Capture returns the recent output from a polecat session.
func (m *SessionManager) Capture(polecat string, lines int) (string, error) {
sessionID := m.SessionName(polecat)
running, err := m.tmux.HasSession(sessionID)
if err != nil {
return "", fmt.Errorf("checking session: %w", err)
}
if !running {
return "", ErrSessionNotFound
}
return m.tmux.CapturePane(sessionID, lines)
}
// CaptureSession returns the recent output from a session by raw session ID.
func (m *SessionManager) CaptureSession(sessionID string, lines int) (string, error) {
running, err := m.tmux.HasSession(sessionID)
if err != nil {
return "", fmt.Errorf("checking session: %w", err)
}
if !running {
return "", ErrSessionNotFound
}
return m.tmux.CapturePane(sessionID, lines)
}
// Inject sends a message to a polecat session.
func (m *SessionManager) Inject(polecat, message string) error {
sessionID := m.SessionName(polecat)
running, err := m.tmux.HasSession(sessionID)
if err != nil {
return fmt.Errorf("checking session: %w", err)
}
if !running {
return ErrSessionNotFound
}
debounceMs := 200 + (len(message)/1024)*100
if debounceMs > 1500 {
debounceMs = 1500
}
return m.tmux.SendKeysDebounced(sessionID, message, debounceMs)
}
// StopAll terminates all polecat sessions for this rig.
func (m *SessionManager) StopAll(force bool) error {
infos, err := m.List()
if err != nil {
return err
}
var lastErr error
for _, info := range infos {
if err := m.Stop(info.Polecat, force); err != nil {
lastErr = err
}
}
return lastErr
}
// validateIssue checks that an issue exists and is not tombstoned.
// This must be called before starting a session to avoid CPU spin loops
// from agents retrying work on invalid issues.
func (m *SessionManager) validateIssue(issueID, workDir string) error {
cmd := exec.Command("bd", "show", issueID, "--json") //nolint:gosec
cmd.Dir = workDir
output, err := cmd.Output()
if err != nil {
return fmt.Errorf("%w: %s", ErrIssueInvalid, issueID)
}
var issues []struct {
Status string `json:"status"`
}
if err := json.Unmarshal(output, &issues); err != nil {
return fmt.Errorf("parsing issue: %w", err)
}
if len(issues) == 0 {
return fmt.Errorf("%w: %s", ErrIssueInvalid, issueID)
}
if issues[0].Status == "tombstone" {
return fmt.Errorf("%w: %s is tombstoned", ErrIssueInvalid, issueID)
}
return nil
}
// hookIssue pins an issue to a polecat's hook using bd update.
func (m *SessionManager) hookIssue(issueID, agentID, workDir string) error {
cmd := exec.Command("bd", "update", issueID, "--status=hooked", "--assignee="+agentID) //nolint:gosec
cmd.Dir = workDir
cmd.Stderr = os.Stderr
if err := cmd.Run(); err != nil {
return fmt.Errorf("bd update failed: %w", err)
}
fmt.Printf("✓ Hooked issue %s to %s\n", issueID, agentID)
return nil
}