feat: runtime-aware tmux agent checks

2026-01-07 12:56:00 +13:00
parent 02ca9e43fa
commit 22693c1dcc
9 changed files with 117 additions and 46 deletions
@@ -150,8 +150,8 @@ func runLiveCosts() error {
 		// Extract cost from content
 		cost := extractCost(content)

-		// Check if Claude is running
-		running := t.IsClaudeRunning(session)
+		// Check if an agent appears to be running
+		running := t.IsAgentRunning(session)

 		costs = append(costs, SessionCost{
 			Session: session,
@@ -428,7 +428,6 @@ func extractCost(content string) float64 {
 	return cost
 }

-
 func outputCostsJSON(output CostsOutput) error {
 	enc := json.NewEncoder(os.Stdout)
 	enc.SetIndent("", "  ")
@@ -89,9 +89,9 @@ func runCrewAt(cmd *cobra.Command, args []string) error {
 	if !hasSession {
 		existingSessions, err := t.FindSessionByWorkDir(worker.ClonePath, true)
 		if err == nil && len(existingSessions) > 0 {
-			// Found an existing session with Claude running in this directory
+			// Found an existing session with an agent running in this directory
 			existingSession := existingSessions[0]
-			fmt.Printf("%s Found existing Claude session '%s' in crew directory\n",
+			fmt.Printf("%s Found existing agent session '%s' in crew directory\n",
 				style.Warning.Render("⚠"),
 				existingSession)
 			fmt.Printf("  Attaching to existing session instead of creating a new one\n")
@@ -164,7 +164,11 @@ func runCrewAt(cmd *cobra.Command, args []string) error {
 		// Session exists - check if Claude is still running
 		// Uses both pane command check and UI marker detection to avoid
 		// restarting when user is in a subshell spawned from Claude
-		if !t.IsClaudeRunning(sessionID) {
+		agentCfg, _, err := config.ResolveAgentConfigWithOverride(townRoot, r.Path, crewAgentOverride)
+		if err != nil {
+			return fmt.Errorf("resolving agent: %w", err)
+		}
+		if !t.IsAgentRunning(sessionID, config.ExpectedPaneCommands(agentCfg)...) {
 			// Claude has exited, restart it using respawn-pane
 			fmt.Printf("Claude exited, restarting...\n")

@@ -447,13 +447,13 @@ func runSling(cmd *cobra.Command, args []string) error {
 	if targetPane == "" {
 		fmt.Printf("%s No pane to nudge (agent will discover work via gt prime)\n", style.Dim.Render("○"))
 	} else {
-		// Ensure Claude is ready before nudging (prevents race condition where
+		// Ensure agent is ready before nudging (prevents race condition where
 		// message arrives before Claude has fully started - see issue #115)
 		sessionName := getSessionFromPane(targetPane)
 		if sessionName != "" {
-			if err := ensureClaudeReady(sessionName); err != nil {
+			if err := ensureAgentReady(sessionName); err != nil {
 				// Non-fatal: warn and continue, agent will discover work via gt prime
-				fmt.Printf("%s Could not verify Claude ready: %v\n", style.Dim.Render("○"), err)
+				fmt.Printf("%s Could not verify agent ready: %v\n", style.Dim.Render("○"), err)
 			}
 		}

@@ -605,30 +605,32 @@ func getSessionFromPane(pane string) string {
 	return pane
 }

-// ensureClaudeReady waits for Claude to be ready before nudging an existing session.
-// Uses the same pragmatic approach as session.Start(): poll for node process,
-// accept bypass dialog if present, then wait for full initialization.
-// Returns early if Claude is already running and ready.
-func ensureClaudeReady(sessionName string) error {
+// ensureAgentReady waits for an agent to be ready before nudging an existing session.
+// It returns nil at once if a non-shell command is already running; otherwise it waits for the
+// pane to leave a shell (error on failure), then sleeps 8s after the Claude bypass step, else 1s.
+func ensureAgentReady(sessionName string) error {
 	t := tmux.NewTmux()

-	// If Claude is already running, assume it's ready (session was started earlier)
-	if t.IsClaudeRunning(sessionName) {
+	// No expected commands are passed, so any non-shell pane command counts; assume it is ready.
+	if t.IsAgentRunning(sessionName) {
 		return nil
 	}

-	// Claude not running yet - wait for it to start (shell → node transition)
+	// Agent not running yet - wait for the pane to leave the shell (ClaudeStartTimeout is used for every agent)
 	if err := t.WaitForCommand(sessionName, constants.SupportedShells, constants.ClaudeStartTimeout); err != nil {
-		return fmt.Errorf("waiting for Claude to start: %w", err)
+		return fmt.Errorf("waiting for agent to start: %w", err)
 	}

-	// Accept bypass permissions warning if present
-	_ = t.AcceptBypassPermissionsWarning(sessionName)
+	// Claude-only (pane command is "node"): accept the bypass permissions warning; its error is deliberately ignored
+	if t.IsClaudeRunning(sessionName) {
+		_ = t.AcceptBypassPermissionsWarning(sessionName)

-	// Wait for Claude to be fully ready at the prompt
-	// PRAGMATIC APPROACH: Use fixed delay rather than detection.
-	// Claude startup takes ~5-8 seconds on typical machines.
-	time.Sleep(8 * time.Second)
+		// PRAGMATIC APPROACH: a fixed delay is used instead of detecting the prompt.
+		// Claude startup takes ~5-8 seconds on typical machines, hence the 8s upper bound.
+		time.Sleep(8 * time.Second)
+	} else {
+		time.Sleep(1 * time.Second) // other agents only get a brief fixed settle delay
+	}

 	return nil
 }
@@ -277,7 +277,8 @@ func startConfiguredCrew(t *tmux.Tmux, townRoot string) {
 			sessionID := crewSessionName(r.Name, crewName)
 			if running, _ := t.HasSession(sessionID); running {
 				// Session exists - check if Claude is still running
-				if !t.IsClaudeRunning(sessionID) {
+				agentCfg := config.ResolveAgentConfig(townRoot, r.Path)
+				if !t.IsAgentRunning(sessionID, config.ExpectedPaneCommands(agentCfg)...) {
 					// Claude has exited, restart it
 					fmt.Printf("  %s %s/%s session exists, restarting Claude...\n", style.Dim.Render("○"), r.Name, crewName)
 					claudeCmd := config.BuildCrewStartupCommand(r.Name, crewName, r.Path, "gt prime")
@@ -800,7 +801,11 @@ func runStartCrew(cmd *cobra.Command, args []string) error {

 	if hasSession {
 		// Session exists - check if Claude is still running
-		if !t.IsClaudeRunning(sessionID) {
+		agentCfg, _, err := config.ResolveAgentConfigWithOverride(townRoot, r.Path, startCrewAgentOverride)
+		if err != nil {
+			return fmt.Errorf("resolving agent: %w", err)
+		}
+		if !t.IsAgentRunning(sessionID, config.ExpectedPaneCommands(agentCfg)...) {
 			// Claude has exited, restart it with "gt prime" as initial prompt
 			fmt.Printf("Session exists, restarting Claude...\n")
 			startupCmd, err := config.BuildCrewStartupCommandWithAgentOverride(rigName, name, r.Path, "gt prime", startCrewAgentOverride)
@@ -1204,6 +1204,18 @@ func BuildCrewStartupCommandWithAgentOverride(rigName, crewName, rigPath, prompt
 	return BuildStartupCommandWithAgentOverride(envVars, rigPath, prompt, agentOverride)
 }

+// ExpectedPaneCommands returns the tmux pane command names that indicate rc's runtime is running.
+// A nil rc or empty Command yields nil; base name "claude" yields "node"; otherwise the base name of Command.
+func ExpectedPaneCommands(rc *RuntimeConfig) []string {
+	if rc == nil || rc.Command == "" {
+		return nil
+	}
+	if filepath.Base(rc.Command) == "claude" {
+		return []string{"node"}
+	}
+	return []string{filepath.Base(rc.Command)}
+}
+
 // GetRigPrefix returns the beads prefix for a rig from rigs.json.
 // Falls back to "gt" if the rig isn't found or has no prefix configured.
 // townRoot is the path to the town directory (e.g., ~/gt).
@@ -1165,6 +1165,22 @@ func TestGetRuntimeCommand_UsesRigAgentWhenRigPathProvided(t *testing.T) {
 	}
 }

+func TestExpectedPaneCommands(t *testing.T) { // covers the claude special case and the generic executable-name case
+	t.Run("claude maps to node", func(t *testing.T) {
+		got := ExpectedPaneCommands(&RuntimeConfig{Command: "claude"})
+		if len(got) != 1 || got[0] != "node" { // exactly one entry expected, and it must be "node" rather than "claude"
+			t.Fatalf("ExpectedPaneCommands(claude) = %v, want %v", got, []string{"node"})
+		}
+	})
+
+	t.Run("codex maps to executable", func(t *testing.T) {
+		got := ExpectedPaneCommands(&RuntimeConfig{Command: "codex"})
+		if len(got) != 1 || got[0] != "codex" { // non-claude commands are returned under their own name
+			t.Fatalf("ExpectedPaneCommands(codex) = %v, want %v", got, []string{"codex"})
+		}
+	})
+}
+
 func TestLoadRuntimeConfigFromSettings(t *testing.T) {
 	// Create temp rig with custom runtime config
 	dir := t.TempDir()
@@ -138,7 +138,9 @@ func (m *Manager) Start(foreground bool) error {
 	running, _ := t.HasSession(sessionID)
 	if running {
 		// Session exists - check if Claude is actually running (healthy vs zombie)
-		if t.IsClaudeRunning(sessionID) {
+		townRoot := filepath.Dir(m.rig.Path)
+		agentCfg := config.ResolveAgentConfig(townRoot, m.rig.Path)
+		if t.IsAgentRunning(sessionID, config.ExpectedPaneCommands(agentCfg)...) {
 			// Healthy - Claude is running
 			return ErrAlreadyRunning
 		}
@@ -15,8 +15,8 @@ import (

 // Common errors
 var (
-	ErrNoServer       = errors.New("no tmux server running")
-	ErrSessionExists  = errors.New("session already exists")
+	ErrNoServer        = errors.New("no tmux server running")
+	ErrSessionExists   = errors.New("session already exists")
 	ErrSessionNotFound = errors.New("session not found")
 )

@@ -94,7 +94,7 @@ func (t *Tmux) EnsureSessionFresh(name, workDir string) error {

 	if exists {
 		// Session exists - check if it's a zombie
-		if !t.IsClaudeRunning(name) {
+		if !t.IsAgentRunning(name) {
 			// Zombie session: tmux alive but Claude dead
 			// Kill it so we can create a fresh one
 			if err := t.KillSession(name); err != nil {
@@ -390,8 +390,8 @@ func (t *Tmux) GetPaneWorkDir(session string) (string, error) {

 // FindSessionByWorkDir finds tmux sessions where the pane's current working directory
 // matches or is under the target directory. Returns session names that match.
-// If checkClaude is true, only returns sessions that have Claude (node) running.
-func (t *Tmux) FindSessionByWorkDir(targetDir string, checkClaude bool) ([]string, error) {
+// If requireAgentRunning is true, only returns sessions that have some non-shell command running.
+func (t *Tmux) FindSessionByWorkDir(targetDir string, requireAgentRunning bool) ([]string, error) {
 	sessions, err := t.ListSessions()
 	if err != nil {
 		return nil, err
@@ -410,9 +410,9 @@ func (t *Tmux) FindSessionByWorkDir(targetDir string, checkClaude bool) ([]strin

 		// Check if workdir matches target (exact match or subdir)
 		if workDir == targetDir || strings.HasPrefix(workDir, targetDir+"/") {
-			if checkClaude {
-				// Only include if Claude is running
-				if t.IsClaudeRunning(session) {
+			if requireAgentRunning {
+				// Only include if an agent appears to be running
+				if t.IsAgentRunning(session) {
 					matches = append(matches, session)
 				}
 			} else {
@@ -526,15 +526,39 @@ Run: gt mail inbox
 	return t.SendKeys(session, banner)
 }

-// IsClaudeRunning checks if Claude appears to be running in the session.
-// Only trusts the pane command - UI markers in scrollback cause false positives.
-func (t *Tmux) IsClaudeRunning(session string) bool {
-	// Check pane command - Claude runs as node
+// IsAgentRunning reports whether an agent appears to be running in the session.
+// It returns false when the pane command cannot be read (GetPaneCommand returns an error).
+// If expectedPaneCommands is non-empty, the pane's current command must equal one of them.
+// If expectedPaneCommands is empty, any non-empty, non-shell command counts as "agent running".
+func (t *Tmux) IsAgentRunning(session string, expectedPaneCommands ...string) bool {
 	cmd, err := t.GetPaneCommand(session)
 	if err != nil {
 		return false
 	}
-	return cmd == "node"
+
+	if len(expectedPaneCommands) > 0 {
+		for _, expected := range expectedPaneCommands {
+			if expected != "" && cmd == expected { // empty entries never match, even against an empty pane command
+				return true
+			}
+		}
+		return false
+	}
+
+	// Fallback: any non-empty command that is not in constants.SupportedShells counts as running.
+	for _, shell := range constants.SupportedShells {
+		if cmd == shell {
+			return false
+		}
+	}
+	return cmd != ""
+}
+
+// IsClaudeRunning checks if Claude appears to be running in the session.
+// Only trusts the pane command - UI markers in scrollback cause false positives.
+func (t *Tmux) IsClaudeRunning(session string) bool {
+	// Claude's pane command is "node", so delegate with that as the only expected command
+	return t.IsAgentRunning(session, "node")
 }

 // WaitForCommand polls until the pane is NOT running one of the excluded commands.
@@ -595,14 +619,16 @@ func (t *Tmux) WaitForShellReady(session string, timeout time.Duration) error {
 // ZFC (Zero False Commands) principle: AI should observe AI, not regex.
 //
 // Bootstrap (acceptable):
-//   During cold startup when no AI agent is running, the daemon uses this
-//   function to get the Deacon online. Regex is acceptable here.
+//
+//	During cold startup when no AI agent is running, the daemon uses this
+//	function to get the Deacon online. Regex is acceptable here.
 //
 // Steady-State (use AI observation instead):
-//   Once any AI agent is running, observation should be AI-to-AI:
-//   - Deacon starting polecats → use 'gt deacon pending' + AI analysis
-//   - Deacon restarting → Mayor watches via 'gt peek'
-//   - Mayor restarting → Deacon watches via 'gt peek'
+//
+//	Once any AI agent is running, observation should be AI-to-AI:
+//	- Deacon starting polecats → use 'gt deacon pending' + AI analysis
+//	- Deacon restarting → Mayor watches via 'gt peek'
+//	- Mayor restarting → Deacon watches via 'gt peek'
 //
 // See: gt deacon pending (ZFC-compliant AI observation)
 // See: gt deacon trigger-pending (bootstrap mode, regex-based)
@@ -260,6 +260,11 @@ func TestEnsureSessionFresh_ZombieSession(t *testing.T) {
 		t.Skip("session unexpectedly has Claude running - can't test zombie case")
 	}

+	// Verify generic agent check also treats it as not running (shell session)
+	if tm.IsAgentRunning(sessionName) {
+		t.Fatalf("expected IsAgentRunning(%q) to be false for a fresh shell session", sessionName)
+	}
+
 	// EnsureSessionFresh should kill the zombie and create fresh session
 	// This should NOT error with "session already exists"
 	if err := tm.EnsureSessionFresh(sessionName, ""); err != nil {