From afff85cdffff838bb808b4e44621dc1cac8d692d Mon Sep 17 00:00:00 2001 From: jack Date: Thu, 8 Jan 2026 23:35:31 -0800 Subject: [PATCH] fix(tmux): use NewSessionWithCommand to avoid send-keys race condition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agent sessions would fail on startup because send-keys arrived before the shell was ready, causing 'bad pattern' and 'command not found' errors. Fix: Create sessions with the command directly using tmux new-session's command argument. This runs the agent as the pane's initial process, avoiding shell readiness timing issues entirely. Updated all agent managers: mayor, deacon, witness, refinery, polecat, crew. Also fixes pre-existing build error in polecat/manager.go (polecatPath → clonePath/newClonePath). Closes #280 Co-Authored-By: Claude Opus 4.5 --- internal/crew/manager.go | 65 +++++++++++++---------------- internal/deacon/manager.go | 30 +++++-------- internal/mayor/manager.go | 31 ++++++-------- internal/polecat/manager.go | 8 ++-- internal/polecat/session_manager.go | 33 ++++++--------- internal/refinery/manager.go | 25 ++++------- internal/tmux/tmux.go | 16 +++++++ internal/witness/manager.go | 29 ++++--------- 8 files changed, 99 insertions(+), 138 deletions(-) diff --git a/internal/crew/manager.go b/internal/crew/manager.go index 70e31198..8270ec35 100644 --- a/internal/crew/manager.go +++ b/internal/crew/manager.go @@ -482,8 +482,34 @@ func (m *Manager) Start(name string, opts StartOptions) error { return fmt.Errorf("ensuring Claude settings: %w", err) } - // Create tmux session - if err := t.NewSession(sessionID, worker.ClonePath); err != nil { + // Build the startup beacon for predecessor discovery via /resume + // Pass it as Claude's initial prompt - processed when Claude is ready + address := fmt.Sprintf("%s/crew/%s", m.rig.Name, name) + topic := opts.Topic + if topic == "" { + topic = "start" + } + beacon := session.FormatStartupNudge(session.StartupNudgeConfig{ + Recipient: address, + Sender: "human", + Topic: topic, + }) + + // Build startup command first + // SessionStart hook handles context loading (gt prime --hook) + claudeCmd, err := config.BuildCrewStartupCommandWithAgentOverride(m.rig.Name, name, m.rig.Path, beacon, opts.AgentOverride) + if err != nil { + return fmt.Errorf("building startup command: %w", err) + } + + // For interactive/refresh mode, remove --dangerously-skip-permissions + if opts.Interactive { + claudeCmd = strings.Replace(claudeCmd, " --dangerously-skip-permissions", "", 1) + } + + // Create session with command directly to avoid send-keys race condition. + // See: https://github.com/anthropics/gastown/issues/280 + if err := t.NewSessionWithCommand(sessionID, worker.ClonePath, claudeCmd); err != nil { return fmt.Errorf("creating session: %w", err) } @@ -504,41 +530,6 @@ func (m *Manager) Start(name string, opts StartOptions) error { // Set up C-b n/p keybindings for crew session cycling (non-fatal) _ = t.SetCrewCycleBindings(sessionID) - // Wait for shell to be ready - if err := t.WaitForShellReady(sessionID, constants.ShellReadyTimeout); err != nil { - return fmt.Errorf("waiting for shell: %w", err) - } - - // Build the startup beacon for predecessor discovery via /resume - // Pass it as Claude's initial prompt - processed when Claude is ready - address := fmt.Sprintf("%s/crew/%s", m.rig.Name, name) - topic := opts.Topic - if topic == "" { - topic = "start" - } - beacon := session.FormatStartupNudge(session.StartupNudgeConfig{ - Recipient: address, - Sender: "human", - Topic: topic, - }) - - // Start claude with environment exports and beacon as initial prompt - // SessionStart hook handles context loading (gt prime --hook) - claudeCmd, err := config.BuildCrewStartupCommandWithAgentOverride(m.rig.Name, name, m.rig.Path, beacon, opts.AgentOverride) - if err != nil { - _ = t.KillSession(sessionID) - return fmt.Errorf("building startup command: %w", err) - } - - // For interactive/refresh mode, remove --dangerously-skip-permissions - if opts.Interactive { - claudeCmd = strings.Replace(claudeCmd, " --dangerously-skip-permissions", "", 1) - } - if err := t.SendKeys(sessionID, claudeCmd); err != nil { - _ = t.KillSession(sessionID) // best-effort cleanup - return fmt.Errorf("starting claude: %w", err) - } - // Wait for Claude to start (non-fatal: session continues even if this times out) _ = t.WaitForCommand(sessionID, constants.SupportedShells, constants.ClaudeStartTimeout) diff --git a/internal/deacon/manager.go b/internal/deacon/manager.go index 58af1234..d1d77d81 100644 --- a/internal/deacon/manager.go +++ b/internal/deacon/manager.go @@ -79,8 +79,16 @@ func (m *Manager) Start(agentOverride string) error { return fmt.Errorf("ensuring Claude settings: %w", err) } - // Create new tmux session - if err := t.NewSession(sessionID, deaconDir); err != nil { + // Build startup command first + // Restarts are handled by daemon via ensureDeaconRunning on each heartbeat + startupCmd, err := config.BuildAgentStartupCommandWithAgentOverride("deacon", "deacon", "", "", agentOverride) + if err != nil { + return fmt.Errorf("building startup command: %w", err) + } + + // Create session with command directly to avoid send-keys race condition. + // See: https://github.com/anthropics/gastown/issues/280 + if err := t.NewSessionWithCommand(sessionID, deaconDir, startupCmd); err != nil { return fmt.Errorf("creating tmux session: %w", err) } @@ -92,24 +100,6 @@ func (m *Manager) Start(agentOverride string) error { theme := tmux.DeaconTheme() _ = t.ConfigureGasTownSession(sessionID, theme, "", "Deacon", "health-check") - // Launch Claude directly (no shell respawn loop) - // Restarts are handled by daemon via ensureDeaconRunning on each heartbeat - startupCmd, err := config.BuildAgentStartupCommandWithAgentOverride("deacon", "deacon", "", "", agentOverride) - if err != nil { - _ = t.KillSession(sessionID) - return fmt.Errorf("building startup command: %w", err) - } - - // Wait for shell to be ready before sending keys (prevents "can't find pane" under load) - if err := t.WaitForShellReady(sessionID, 5*time.Second); err != nil { - _ = t.KillSession(sessionID) - return fmt.Errorf("waiting for shell: %w", err) - } - if err := t.SendKeysDelayed(sessionID, startupCmd, 200); err != nil { - _ = t.KillSession(sessionID) // best-effort cleanup - return fmt.Errorf("starting Claude agent: %w", err) - } - // Wait for Claude to start (non-fatal) if err := t.WaitForCommand(sessionID, constants.SupportedShells, constants.ClaudeStartTimeout); err != nil { // Non-fatal - try to continue anyway diff --git a/internal/mayor/manager.go b/internal/mayor/manager.go index 3a6a985b..6510a219 100644 --- a/internal/mayor/manager.go +++ b/internal/mayor/manager.go @@ -78,8 +78,18 @@ func (m *Manager) Start(agentOverride string) error { return fmt.Errorf("ensuring Claude settings: %w", err) } - // Create new tmux session - if err := t.NewSession(sessionID, mayorDir); err != nil { + // Build startup command first - the startup hook handles 'gt prime' automatically + // Export GT_ROLE and BD_ACTOR in the command since tmux SetEnvironment only affects new panes + startupCmd, err := config.BuildAgentStartupCommandWithAgentOverride("mayor", "mayor", "", "", agentOverride) + if err != nil { + return fmt.Errorf("building startup command: %w", err) + } + + // Create session with command directly to avoid send-keys race condition. + // This runs the command as the pane's initial process, avoiding the shell + // readiness timing issues that cause "bad pattern" and command-not-found errors. + // See: https://github.com/anthropics/gastown/issues/280 + if err := t.NewSessionWithCommand(sessionID, mayorDir, startupCmd); err != nil { return fmt.Errorf("creating tmux session: %w", err) } @@ -91,23 +101,6 @@ func (m *Manager) Start(agentOverride string) error { theme := tmux.MayorTheme() _ = t.ConfigureGasTownSession(sessionID, theme, "", "Mayor", "coordinator") - // Launch Claude - the startup hook handles 'gt prime' automatically - // Export GT_ROLE and BD_ACTOR in the command since tmux SetEnvironment only affects new panes - startupCmd, err := config.BuildAgentStartupCommandWithAgentOverride("mayor", "mayor", "", "", agentOverride) - if err != nil { - _ = t.KillSession(sessionID) // best-effort cleanup - return fmt.Errorf("building startup command: %w", err) - } - // Wait for shell to be ready before sending keys (prevents "can't find pane" under load) - if err := t.WaitForShellReady(sessionID, 5*time.Second); err != nil { - _ = t.KillSession(sessionID) - return fmt.Errorf("waiting for shell: %w", err) - } - if err := t.SendKeysDelayed(sessionID, startupCmd, 200); err != nil { - _ = t.KillSession(sessionID) // best-effort cleanup - return fmt.Errorf("starting Claude agent: %w", err) - } - // Wait for Claude to start (non-fatal) if err := t.WaitForCommand(sessionID, constants.SupportedShells, constants.ClaudeStartTimeout); err != nil { // Non-fatal - try to continue anyway diff --git a/internal/polecat/manager.go b/internal/polecat/manager.go index f80b6c36..fae55705 100644 --- a/internal/polecat/manager.go +++ b/internal/polecat/manager.go @@ -281,7 +281,7 @@ func (m *Manager) AddWithOptions(name string, opts AddOptions) (*Polecat, error) // Copy overlay files from .runtime/overlay/ to polecat root. // This allows services to have .env and other config files at their root. - if err := rig.CopyOverlay(m.rig.Path, polecatPath); err != nil { + if err := rig.CopyOverlay(m.rig.Path, clonePath); err != nil { // Non-fatal - log warning but continue fmt.Printf("Warning: could not copy overlay files: %v\n", err) } @@ -538,7 +538,7 @@ func (m *Manager) RepairWorktreeWithOptions(name string, force bool, opts AddOpt } // Copy overlay files from .runtime/overlay/ to polecat root. - if err := rig.CopyOverlay(m.rig.Path, polecatPath); err != nil { + if err := rig.CopyOverlay(m.rig.Path, newClonePath); err != nil { fmt.Printf("Warning: could not copy overlay files: %v\n", err) } @@ -787,9 +787,9 @@ func (m *Manager) loadFromBeads(name string) (*Polecat, error) { // setupSharedBeads creates a redirect file so the polecat uses the rig's shared .beads database. // This eliminates the need for git sync between polecat clones - all polecats share one database. -func (m *Manager) setupSharedBeads(polecatPath string) error { +func (m *Manager) setupSharedBeads(clonePath string) error { townRoot := filepath.Dir(m.rig.Path) - return beads.SetupRedirect(townRoot, polecatPath) + return beads.SetupRedirect(townRoot, clonePath) } // CleanupStaleBranches removes orphaned polecat branches that are no longer in use. diff --git a/internal/polecat/session_manager.go b/internal/polecat/session_manager.go index ab202d6f..469b5d90 100644 --- a/internal/polecat/session_manager.go +++ b/internal/polecat/session_manager.go @@ -168,8 +168,19 @@ func (m *SessionManager) Start(polecat string, opts SessionStartOptions) error { return fmt.Errorf("ensuring runtime settings: %w", err) } - // Create session - if err := m.tmux.NewSession(sessionID, workDir); err != nil { + // Build startup command first + command := opts.Command + if command == "" { + command = config.BuildPolecatStartupCommand(m.rig.Name, polecat, m.rig.Path, "") + } + // Prepend runtime config dir env if needed + if runtimeConfig.Session != nil && runtimeConfig.Session.ConfigDirEnv != "" && opts.RuntimeConfigDir != "" { + command = config.PrependEnv(command, map[string]string{runtimeConfig.Session.ConfigDirEnv: opts.RuntimeConfigDir}) + } + + // Create session with command directly to avoid send-keys race condition. + // See: https://github.com/anthropics/gastown/issues/280 + if err := m.tmux.NewSessionWithCommand(sessionID, workDir, command); err != nil { return fmt.Errorf("creating session: %w", err) } @@ -205,24 +216,6 @@ func (m *SessionManager) Start(polecat string, opts SessionStartOptions) error { agentID := fmt.Sprintf("%s/%s", m.rig.Name, polecat) debugSession("SetPaneDiedHook", m.tmux.SetPaneDiedHook(sessionID, agentID)) - // Send initial command with env vars exported inline - command := opts.Command - if command == "" { - command = config.BuildPolecatStartupCommand(m.rig.Name, polecat, m.rig.Path, "") - } - // Prepend runtime config dir env if needed - if runtimeConfig.Session != nil && runtimeConfig.Session.ConfigDirEnv != "" && opts.RuntimeConfigDir != "" { - command = config.PrependEnv(command, map[string]string{runtimeConfig.Session.ConfigDirEnv: opts.RuntimeConfigDir}) - } - // Wait for shell to be ready before sending keys (prevents "can't find pane" under load) - if err := m.tmux.WaitForShellReady(sessionID, 5*time.Second); err != nil { - _ = m.tmux.KillSession(sessionID) - return fmt.Errorf("waiting for shell: %w", err) - } - if err := m.tmux.SendKeys(sessionID, command); err != nil { - return fmt.Errorf("sending command: %w", err) - } - // Wait for Claude to start (non-fatal) debugSession("WaitForCommand", m.tmux.WaitForCommand(sessionID, constants.SupportedShells, constants.ClaudeStartTimeout)) diff --git a/internal/refinery/manager.go b/internal/refinery/manager.go index c2540b7b..e30d4143 100644 --- a/internal/refinery/manager.go +++ b/internal/refinery/manager.go @@ -174,12 +174,17 @@ func (m *Manager) Start(foreground bool) error { return fmt.Errorf("ensuring runtime settings: %w", err) } - if err := t.NewSession(sessionID, refineryRigDir); err != nil { + // Build startup command first + bdActor := fmt.Sprintf("%s/refinery", m.rig.Name) + command := config.BuildAgentStartupCommand("refinery", bdActor, m.rig.Path, "") + + // Create session with command directly to avoid send-keys race condition. + // See: https://github.com/anthropics/gastown/issues/280 + if err := t.NewSessionWithCommand(sessionID, refineryRigDir, command); err != nil { return fmt.Errorf("creating tmux session: %w", err) } // Set environment variables (non-fatal: session works without these) - bdActor := fmt.Sprintf("%s/refinery", m.rig.Name) _ = t.SetEnvironment(sessionID, "GT_RIG", m.rig.Name) _ = t.SetEnvironment(sessionID, "GT_REFINERY", "1") _ = t.SetEnvironment(sessionID, "GT_ROLE", "refinery") @@ -206,22 +211,6 @@ func (m *Manager) Start(foreground bool) error { return fmt.Errorf("saving state: %w", err) } - // Start Claude agent with full permissions (like polecats) - // NOTE: No gt prime injection needed - SessionStart hook handles it automatically - // Restarts are handled by daemon via LIFECYCLE mail, not shell loops - // Export GT_ROLE and BD_ACTOR in the command since tmux SetEnvironment only affects new panes - command := config.BuildAgentStartupCommand("refinery", bdActor, m.rig.Path, "") - // Wait for shell to be ready before sending keys (prevents "can't find pane" under load) - if err := t.WaitForShellReady(sessionID, 5*time.Second); err != nil { - _ = t.KillSession(sessionID) - return fmt.Errorf("waiting for shell: %w", err) - } - if err := t.SendKeys(sessionID, command); err != nil { - // Clean up the session on failure (best-effort cleanup) - _ = t.KillSession(sessionID) - return fmt.Errorf("starting Claude agent: %w", err) - } - // Wait for Claude to start and show its prompt (non-fatal) // WaitForRuntimeReady waits for the runtime to be ready if err := t.WaitForRuntimeReady(sessionID, runtimeConfig, constants.ClaudeStartTimeout); err != nil { diff --git a/internal/tmux/tmux.go b/internal/tmux/tmux.go index a86a1527..54afcdf3 100644 --- a/internal/tmux/tmux.go +++ b/internal/tmux/tmux.go @@ -78,6 +78,22 @@ func (t *Tmux) NewSession(name, workDir string) error { return err } +// NewSessionWithCommand creates a new detached tmux session that immediately runs a command. +// Unlike NewSession + SendKeys, this avoids race conditions where the shell isn't ready +// or the command arrives before the shell prompt. The command runs directly as the +// initial process of the pane. +// See: https://github.com/anthropics/gastown/issues/280 +func (t *Tmux) NewSessionWithCommand(name, workDir, command string) error { + args := []string{"new-session", "-d", "-s", name} + if workDir != "" { + args = append(args, "-c", workDir) + } + // Add the command as the last argument - tmux runs it as the pane's initial process + args = append(args, command) + _, err := t.run(args...) + return err +} + // EnsureSessionFresh ensures a session is available and healthy. // If the session exists but is a zombie (Claude not running), it kills the session first. // This prevents "session already exists" errors when trying to restart dead agents. diff --git a/internal/witness/manager.go b/internal/witness/manager.go index 56830cca..950fc782 100644 --- a/internal/witness/manager.go +++ b/internal/witness/manager.go @@ -151,13 +151,19 @@ func (m *Manager) Start(foreground bool) error { return fmt.Errorf("ensuring Claude settings: %w", err) } - // Create new tmux session - if err := t.NewSession(sessionID, witnessDir); err != nil { + // Build startup command first + // Pass m.rig.Path so rig agent settings are honored (not town-level defaults) + bdActor := fmt.Sprintf("%s/witness", m.rig.Name) + command := config.BuildAgentStartupCommand("witness", bdActor, m.rig.Path, "") + runtimeConfig := config.LoadRuntimeConfig(m.rig.Path) + + // Create session with command directly to avoid send-keys race condition. + // See: https://github.com/anthropics/gastown/issues/280 + if err := t.NewSessionWithCommand(sessionID, witnessDir, command); err != nil { return fmt.Errorf("creating tmux session: %w", err) } // Set environment variables (non-fatal: session works without these) - bdActor := fmt.Sprintf("%s/witness", m.rig.Name) _ = t.SetEnvironment(sessionID, "GT_ROLE", "witness") _ = t.SetEnvironment(sessionID, "GT_RIG", m.rig.Name) _ = t.SetEnvironment(sessionID, "BD_ACTOR", bdActor) @@ -177,23 +183,6 @@ func (m *Manager) Start(foreground bool) error { return fmt.Errorf("saving state: %w", err) } - // Launch Claude directly (no shell respawn loop) - // Restarts are handled by daemon via LIFECYCLE mail or deacon health-scan - // NOTE: No gt prime injection needed - SessionStart hook handles it automatically - // Export GT_ROLE and BD_ACTOR in the command since tmux SetEnvironment only affects new panes - // Pass m.rig.Path so rig agent settings are honored (not town-level defaults) - command := config.BuildAgentStartupCommand("witness", bdActor, m.rig.Path, "") - runtimeConfig := config.LoadRuntimeConfig(m.rig.Path) - // Wait for shell to be ready before sending keys (prevents "can't find pane" under load) - if err := t.WaitForShellReady(sessionID, 5*time.Second); err != nil { - _ = t.KillSession(sessionID) - return fmt.Errorf("waiting for shell: %w", err) - } - if err := t.SendKeys(sessionID, command); err != nil { - _ = t.KillSession(sessionID) // best-effort cleanup - return fmt.Errorf("starting Claude agent: %w", err) - } - // Wait for runtime to start and show its prompt (non-fatal) if err := t.WaitForRuntimeReady(sessionID, runtimeConfig, constants.ClaudeStartTimeout); err != nil { // Non-fatal - try to continue anyway