fix(tmux): use KillSessionWithProcesses to prevent zombie bash processes

When Claude sessions were terminated using KillSession(), bash subprocesses
spawned by Claude's Bash tool could survive because they ignore SIGHUP.
These orphaned, still-running processes (the "zombies" in the subject line)
accumulated over time.

All critical session termination paths now use KillSessionWithProcesses(),
which explicitly kills all descendant processes before terminating the session.

Fixes: gt-ew3tk

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
slit
2026-01-20 20:37:34 -08:00
committed by beads/crew/emma
parent 78ca8bd5bf
commit 9caf5302d4
18 changed files with 66 additions and 39 deletions

View File

@@ -160,9 +160,10 @@ func (b *Boot) Spawn(agentOverride string) error {
// spawnTmux spawns Boot in a tmux session.
func (b *Boot) spawnTmux(agentOverride string) error {
// Kill any stale session first
// Kill any stale session first.
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
if b.IsSessionAlive() {
_ = b.tmux.KillSession(SessionName)
_ = b.tmux.KillSessionWithProcesses(SessionName)
}
// Ensure boot directory exists (it should have CLAUDE.md with Boot context)

View File

@@ -301,9 +301,10 @@ func runDegradedTriage(b *boot.Boot) (action, target string, err error) {
// Nudge the session to try to wake it up
age := hb.Age()
if age > 30*time.Minute {
// Very stuck - restart the session
// Very stuck - restart the session.
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
fmt.Printf("Deacon heartbeat is %s old - restarting session\n", age.Round(time.Minute))
if err := tm.KillSession(deaconSession); err == nil {
if err := tm.KillSessionWithProcesses(deaconSession); err == nil {
return "restart", "deacon-stuck", nil
}
} else {

View File

@@ -28,11 +28,12 @@ func runCrewRename(cmd *cobra.Command, args []string) error {
return err
}
// Kill any running session for the old name
// Kill any running session for the old name.
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
t := tmux.NewTmux()
oldSessionID := crewSessionName(r.Name, oldName)
if hasSession, _ := t.HasSession(oldSessionID); hasSession {
if err := t.KillSession(oldSessionID); err != nil {
if err := t.KillSessionWithProcesses(oldSessionID); err != nil {
return fmt.Errorf("killing old session: %w", err)
}
fmt.Printf("Killed session %s\n", oldSessionID)

View File

@@ -491,8 +491,9 @@ func runDeaconStop(cmd *cobra.Command, args []string) error {
_ = t.SendKeysRaw(sessionName, "C-c")
time.Sleep(100 * time.Millisecond)
// Kill the session
if err := t.KillSession(sessionName); err != nil {
// Kill the session.
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
if err := t.KillSessionWithProcesses(sessionName); err != nil {
return fmt.Errorf("killing session: %w", err)
}
@@ -592,8 +593,9 @@ func runDeaconRestart(cmd *cobra.Command, args []string) error {
fmt.Println("Restarting Deacon...")
if running {
// Kill existing session
if err := t.KillSession(sessionName); err != nil {
// Kill existing session.
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
if err := t.KillSessionWithProcesses(sessionName); err != nil {
style.PrintWarning("failed to kill session: %v", err)
}
}
@@ -876,9 +878,10 @@ func runDeaconForceKill(cmd *cobra.Command, args []string) error {
mailBody := fmt.Sprintf("Deacon detected %s as unresponsive.\nReason: %s\nAction: force-killing session", agent, reason)
sendMail(townRoot, agent, "FORCE_KILL: unresponsive", mailBody)
// Step 2: Kill the tmux session
// Step 2: Kill the tmux session.
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
fmt.Printf("%s Killing tmux session %s...\n", style.Dim.Render("2."), sessionName)
if err := t.KillSession(sessionName); err != nil {
if err := t.KillSessionWithProcesses(sessionName); err != nil {
return fmt.Errorf("killing session: %w", err)
}

View File

@@ -192,12 +192,13 @@ func runWitnessStop(cmd *cobra.Command, args []string) error {
return err
}
// Kill tmux session if it exists
// Kill tmux session if it exists.
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
t := tmux.NewTmux()
sessionName := witnessSessionName(rigName)
running, _ := t.HasSession(sessionName)
if running {
if err := t.KillSession(sessionName); err != nil {
if err := t.KillSessionWithProcesses(sessionName); err != nil {
style.PrintWarning("failed to kill session: %v", err)
}
}

View File

@@ -62,6 +62,7 @@ type Connection interface {
TmuxNewSession(name, dir string) error
// TmuxKillSession terminates the named tmux session.
// Uses KillSessionWithProcesses internally to ensure all descendant processes are killed.
TmuxKillSession(name string) error
// TmuxSendKeys sends keys to the named tmux session.

View File

@@ -161,8 +161,9 @@ func (c *LocalConnection) TmuxNewSession(name, dir string) error {
}
// TmuxKillSession terminates a tmux session.
// Uses KillSessionWithProcesses to ensure all descendant processes are killed.
func (c *LocalConnection) TmuxKillSession(name string) error {
return c.tmux.KillSession(name)
return c.tmux.KillSessionWithProcesses(name)
}
// TmuxSendKeys sends keys to a tmux session.

View File

@@ -470,8 +470,9 @@ func (m *Manager) Start(name string, opts StartOptions) error {
}
if running {
if opts.KillExisting {
// Restart mode - kill existing session
if err := t.KillSession(sessionID); err != nil {
// Restart mode - kill existing session.
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
if err := t.KillSessionWithProcesses(sessionID); err != nil {
return fmt.Errorf("killing existing session: %w", err)
}
} else {
@@ -479,8 +480,9 @@ func (m *Manager) Start(name string, opts StartOptions) error {
if t.IsClaudeRunning(sessionID) {
return fmt.Errorf("%w: %s", ErrSessionRunning, sessionID)
}
// Zombie session - kill and recreate
if err := t.KillSession(sessionID); err != nil {
// Zombie session - kill and recreate.
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
if err := t.KillSessionWithProcesses(sessionID); err != nil {
return fmt.Errorf("killing zombie session: %w", err)
}
}
@@ -573,8 +575,10 @@ func (m *Manager) Stop(name string) error {
return ErrSessionNotFound
}
// Kill the session
if err := t.KillSession(sessionID); err != nil {
// Kill the session.
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
// This prevents orphan bash processes from Claude's Bash tool surviving session termination.
if err := t.KillSessionWithProcesses(sessionID); err != nil {
return fmt.Errorf("killing session: %w", err)
}

View File

@@ -430,9 +430,10 @@ func (d *Daemon) checkDeaconHeartbeat() {
// Session exists but heartbeat is stale - Deacon is stuck
if age > 30*time.Minute {
// Very stuck - restart the session
// Very stuck - restart the session.
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
d.logger.Printf("Deacon stuck for %s - restarting session", age.Round(time.Minute))
if err := d.tmux.KillSession(sessionName); err != nil {
if err := d.tmux.KillSessionWithProcesses(sessionName); err != nil {
d.logger.Printf("Error killing stuck Deacon: %v", err)
}
// ensureDeaconRunning will restart on next heartbeat

View File

@@ -179,7 +179,9 @@ func (d *Daemon) executeLifecycleAction(request *LifecycleRequest) error {
switch request.Action {
case ActionShutdown:
if running {
if err := d.tmux.KillSession(sessionName); err != nil {
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
// This prevents orphan bash processes from Claude's Bash tool surviving session termination.
if err := d.tmux.KillSessionWithProcesses(sessionName); err != nil {
return fmt.Errorf("killing session: %w", err)
}
d.logger.Printf("Killed session %s", sessionName)
@@ -188,8 +190,8 @@ func (d *Daemon) executeLifecycleAction(request *LifecycleRequest) error {
case ActionCycle, ActionRestart:
if running {
// Kill the session first
if err := d.tmux.KillSession(sessionName); err != nil {
// Kill the session first - use KillSessionWithProcesses to prevent orphan processes.
if err := d.tmux.KillSessionWithProcesses(sessionName); err != nil {
return fmt.Errorf("killing session: %w", err)
}
d.logger.Printf("Killed session %s for restart", sessionName)

View File

@@ -63,7 +63,8 @@ func (m *Manager) Start(agentOverride string) error {
return ErrAlreadyRunning
}
// Zombie - tmux alive but Claude dead. Kill and recreate.
if err := t.KillSession(sessionID); err != nil {
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
if err := t.KillSessionWithProcesses(sessionID); err != nil {
return fmt.Errorf("killing zombie session: %w", err)
}
}
@@ -154,8 +155,10 @@ func (m *Manager) Stop() error {
_ = t.SendKeysRaw(sessionID, "C-c")
time.Sleep(100 * time.Millisecond)
// Kill the session
if err := t.KillSession(sessionID); err != nil {
// Kill the session.
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
// This prevents orphan bash processes from Claude's Bash tool surviving session termination.
if err := t.KillSessionWithProcesses(sessionID); err != nil {
return fmt.Errorf("killing session: %w", err)
}

View File

@@ -510,8 +510,9 @@ func (c *ClaudeSettingsCheck) Fix(ctx *CheckContext) error {
sf.agentType == "deacon" || sf.agentType == "mayor" {
running, _ := t.HasSession(sf.sessionName)
if running {
// Cycle the agent by killing and letting gt up restart it
_ = t.KillSession(sf.sessionName)
// Cycle the agent by killing and letting gt up restart it.
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
_ = t.KillSessionWithProcesses(sf.sessionName)
}
}
}

View File

@@ -149,7 +149,8 @@ func (c *OrphanSessionCheck) Fix(ctx *CheckContext) error {
// Log pre-death event for crash investigation (before killing)
_ = events.LogFeed(events.TypeSessionDeath, sess,
events.SessionDeathPayload(sess, "unknown", "orphan cleanup", "gt doctor"))
if err := t.KillSession(sess); err != nil {
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
if err := t.KillSessionWithProcesses(sess); err != nil {
lastErr = err
}
}

View File

@@ -123,7 +123,8 @@ func (c *LinkedPaneCheck) Fix(ctx *CheckContext) error {
var lastErr error
for _, session := range c.linkedSessions {
if err := t.KillSession(session); err != nil {
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
if err := t.KillSessionWithProcesses(session); err != nil {
lastErr = err
}
}

View File

@@ -128,7 +128,8 @@ func (c *ZombieSessionCheck) Fix(ctx *CheckContext) error {
_ = events.LogFeed(events.TypeSessionDeath, sess,
events.SessionDeathPayload(sess, "unknown", "zombie cleanup", "gt doctor"))
if err := t.KillSession(sess); err != nil {
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
if err := t.KillSessionWithProcesses(sess); err != nil {
lastErr = err
}
}

View File

@@ -729,12 +729,13 @@ func (m *Manager) ReconcilePoolWith(namesWithDirs, namesWithSessions []string) {
dirSet[name] = true
}
// Kill orphaned sessions (session exists but no directory)
// Kill orphaned sessions (session exists but no directory).
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
if m.tmux != nil {
for _, name := range namesWithSessions {
if !dirSet[name] {
sessionName := fmt.Sprintf("gt-%s-%s", m.rig.Name, name)
_ = m.tmux.KillSession(sessionName)
_ = m.tmux.KillSessionWithProcesses(sessionName)
}
}
}

View File

@@ -289,7 +289,9 @@ func (m *SessionManager) Stop(polecat string, force bool) error {
time.Sleep(100 * time.Millisecond)
}
if err := m.tmux.KillSession(sessionID); err != nil {
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
// This prevents orphan bash processes from Claude's Bash tool surviving session termination.
if err := m.tmux.KillSessionWithProcesses(sessionID); err != nil {
return fmt.Errorf("killing session: %w", err)
}

View File

@@ -68,8 +68,9 @@ func stopTownSessionInternal(t *tmux.Tmux, ts TownSession, force bool) (bool, er
_ = events.LogFeed(events.TypeSessionDeath, ts.SessionID,
events.SessionDeathPayload(ts.SessionID, ts.Name, reason, "gt down"))
// Kill the session
if err := t.KillSession(ts.SessionID); err != nil {
// Kill the session.
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
if err := t.KillSessionWithProcesses(ts.SessionID); err != nil {
return false, fmt.Errorf("killing %s session: %w", ts.Name, err)
}