fix(tmux): use KillSessionWithProcesses to prevent zombie bash processes
When Claude sessions were terminated using KillSession(), bash subprocesses spawned by Claude's Bash tool could survive because they ignore SIGHUP. These surviving orphan processes (the "zombie" bash processes named in the title) accumulated over time.

Changed all critical session termination paths to use KillSessionWithProcesses(), which explicitly kills all descendant processes before terminating the session.

Fixes: gt-ew3tk
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -160,9 +160,10 @@ func (b *Boot) Spawn(agentOverride string) error {
|
||||
|
||||
// spawnTmux spawns Boot in a tmux session.
|
||||
func (b *Boot) spawnTmux(agentOverride string) error {
|
||||
// Kill any stale session first
|
||||
// Kill any stale session first.
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
if b.IsSessionAlive() {
|
||||
_ = b.tmux.KillSession(SessionName)
|
||||
_ = b.tmux.KillSessionWithProcesses(SessionName)
|
||||
}
|
||||
|
||||
// Ensure boot directory exists (it should have CLAUDE.md with Boot context)
|
||||
|
||||
@@ -301,9 +301,10 @@ func runDegradedTriage(b *boot.Boot) (action, target string, err error) {
|
||||
// Nudge the session to try to wake it up
|
||||
age := hb.Age()
|
||||
if age > 30*time.Minute {
|
||||
// Very stuck - restart the session
|
||||
// Very stuck - restart the session.
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
fmt.Printf("Deacon heartbeat is %s old - restarting session\n", age.Round(time.Minute))
|
||||
if err := tm.KillSession(deaconSession); err == nil {
|
||||
if err := tm.KillSessionWithProcesses(deaconSession); err == nil {
|
||||
return "restart", "deacon-stuck", nil
|
||||
}
|
||||
} else {
|
||||
|
||||
@@ -28,11 +28,12 @@ func runCrewRename(cmd *cobra.Command, args []string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
// Kill any running session for the old name
|
||||
// Kill any running session for the old name.
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
t := tmux.NewTmux()
|
||||
oldSessionID := crewSessionName(r.Name, oldName)
|
||||
if hasSession, _ := t.HasSession(oldSessionID); hasSession {
|
||||
if err := t.KillSession(oldSessionID); err != nil {
|
||||
if err := t.KillSessionWithProcesses(oldSessionID); err != nil {
|
||||
return fmt.Errorf("killing old session: %w", err)
|
||||
}
|
||||
fmt.Printf("Killed session %s\n", oldSessionID)
|
||||
|
||||
@@ -491,8 +491,9 @@ func runDeaconStop(cmd *cobra.Command, args []string) error {
|
||||
_ = t.SendKeysRaw(sessionName, "C-c")
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
// Kill the session
|
||||
if err := t.KillSession(sessionName); err != nil {
|
||||
// Kill the session.
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
if err := t.KillSessionWithProcesses(sessionName); err != nil {
|
||||
return fmt.Errorf("killing session: %w", err)
|
||||
}
|
||||
|
||||
@@ -592,8 +593,9 @@ func runDeaconRestart(cmd *cobra.Command, args []string) error {
|
||||
fmt.Println("Restarting Deacon...")
|
||||
|
||||
if running {
|
||||
// Kill existing session
|
||||
if err := t.KillSession(sessionName); err != nil {
|
||||
// Kill existing session.
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
if err := t.KillSessionWithProcesses(sessionName); err != nil {
|
||||
style.PrintWarning("failed to kill session: %v", err)
|
||||
}
|
||||
}
|
||||
@@ -876,9 +878,10 @@ func runDeaconForceKill(cmd *cobra.Command, args []string) error {
|
||||
mailBody := fmt.Sprintf("Deacon detected %s as unresponsive.\nReason: %s\nAction: force-killing session", agent, reason)
|
||||
sendMail(townRoot, agent, "FORCE_KILL: unresponsive", mailBody)
|
||||
|
||||
// Step 2: Kill the tmux session
|
||||
// Step 2: Kill the tmux session.
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
fmt.Printf("%s Killing tmux session %s...\n", style.Dim.Render("2."), sessionName)
|
||||
if err := t.KillSession(sessionName); err != nil {
|
||||
if err := t.KillSessionWithProcesses(sessionName); err != nil {
|
||||
return fmt.Errorf("killing session: %w", err)
|
||||
}
|
||||
|
||||
|
||||
@@ -192,12 +192,13 @@ func runWitnessStop(cmd *cobra.Command, args []string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
// Kill tmux session if it exists
|
||||
// Kill tmux session if it exists.
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
t := tmux.NewTmux()
|
||||
sessionName := witnessSessionName(rigName)
|
||||
running, _ := t.HasSession(sessionName)
|
||||
if running {
|
||||
if err := t.KillSession(sessionName); err != nil {
|
||||
if err := t.KillSessionWithProcesses(sessionName); err != nil {
|
||||
style.PrintWarning("failed to kill session: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -62,6 +62,7 @@ type Connection interface {
|
||||
TmuxNewSession(name, dir string) error
|
||||
|
||||
// TmuxKillSession terminates the named tmux session.
|
||||
// Uses KillSessionWithProcesses internally to ensure all descendant processes are killed.
|
||||
TmuxKillSession(name string) error
|
||||
|
||||
// TmuxSendKeys sends keys to the named tmux session.
|
||||
|
||||
@@ -161,8 +161,9 @@ func (c *LocalConnection) TmuxNewSession(name, dir string) error {
|
||||
}
|
||||
|
||||
// TmuxKillSession terminates a tmux session.
|
||||
// Uses KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
func (c *LocalConnection) TmuxKillSession(name string) error {
|
||||
return c.tmux.KillSession(name)
|
||||
return c.tmux.KillSessionWithProcesses(name)
|
||||
}
|
||||
|
||||
// TmuxSendKeys sends keys to a tmux session.
|
||||
|
||||
@@ -470,8 +470,9 @@ func (m *Manager) Start(name string, opts StartOptions) error {
|
||||
}
|
||||
if running {
|
||||
if opts.KillExisting {
|
||||
// Restart mode - kill existing session
|
||||
if err := t.KillSession(sessionID); err != nil {
|
||||
// Restart mode - kill existing session.
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
if err := t.KillSessionWithProcesses(sessionID); err != nil {
|
||||
return fmt.Errorf("killing existing session: %w", err)
|
||||
}
|
||||
} else {
|
||||
@@ -479,8 +480,9 @@ func (m *Manager) Start(name string, opts StartOptions) error {
|
||||
if t.IsClaudeRunning(sessionID) {
|
||||
return fmt.Errorf("%w: %s", ErrSessionRunning, sessionID)
|
||||
}
|
||||
// Zombie session - kill and recreate
|
||||
if err := t.KillSession(sessionID); err != nil {
|
||||
// Zombie session - kill and recreate.
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
if err := t.KillSessionWithProcesses(sessionID); err != nil {
|
||||
return fmt.Errorf("killing zombie session: %w", err)
|
||||
}
|
||||
}
|
||||
@@ -573,8 +575,10 @@ func (m *Manager) Stop(name string) error {
|
||||
return ErrSessionNotFound
|
||||
}
|
||||
|
||||
// Kill the session
|
||||
if err := t.KillSession(sessionID); err != nil {
|
||||
// Kill the session.
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
// This prevents orphan bash processes from Claude's Bash tool surviving session termination.
|
||||
if err := t.KillSessionWithProcesses(sessionID); err != nil {
|
||||
return fmt.Errorf("killing session: %w", err)
|
||||
}
|
||||
|
||||
|
||||
@@ -430,9 +430,10 @@ func (d *Daemon) checkDeaconHeartbeat() {
|
||||
|
||||
// Session exists but heartbeat is stale - Deacon is stuck
|
||||
if age > 30*time.Minute {
|
||||
// Very stuck - restart the session
|
||||
// Very stuck - restart the session.
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
d.logger.Printf("Deacon stuck for %s - restarting session", age.Round(time.Minute))
|
||||
if err := d.tmux.KillSession(sessionName); err != nil {
|
||||
if err := d.tmux.KillSessionWithProcesses(sessionName); err != nil {
|
||||
d.logger.Printf("Error killing stuck Deacon: %v", err)
|
||||
}
|
||||
// ensureDeaconRunning will restart on next heartbeat
|
||||
|
||||
@@ -179,7 +179,9 @@ func (d *Daemon) executeLifecycleAction(request *LifecycleRequest) error {
|
||||
switch request.Action {
|
||||
case ActionShutdown:
|
||||
if running {
|
||||
if err := d.tmux.KillSession(sessionName); err != nil {
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
// This prevents orphan bash processes from Claude's Bash tool surviving session termination.
|
||||
if err := d.tmux.KillSessionWithProcesses(sessionName); err != nil {
|
||||
return fmt.Errorf("killing session: %w", err)
|
||||
}
|
||||
d.logger.Printf("Killed session %s", sessionName)
|
||||
@@ -188,8 +190,8 @@ func (d *Daemon) executeLifecycleAction(request *LifecycleRequest) error {
|
||||
|
||||
case ActionCycle, ActionRestart:
|
||||
if running {
|
||||
// Kill the session first
|
||||
if err := d.tmux.KillSession(sessionName); err != nil {
|
||||
// Kill the session first - use KillSessionWithProcesses to prevent orphan processes.
|
||||
if err := d.tmux.KillSessionWithProcesses(sessionName); err != nil {
|
||||
return fmt.Errorf("killing session: %w", err)
|
||||
}
|
||||
d.logger.Printf("Killed session %s for restart", sessionName)
|
||||
|
||||
@@ -63,7 +63,8 @@ func (m *Manager) Start(agentOverride string) error {
|
||||
return ErrAlreadyRunning
|
||||
}
|
||||
// Zombie - tmux alive but Claude dead. Kill and recreate.
|
||||
if err := t.KillSession(sessionID); err != nil {
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
if err := t.KillSessionWithProcesses(sessionID); err != nil {
|
||||
return fmt.Errorf("killing zombie session: %w", err)
|
||||
}
|
||||
}
|
||||
@@ -154,8 +155,10 @@ func (m *Manager) Stop() error {
|
||||
_ = t.SendKeysRaw(sessionID, "C-c")
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
// Kill the session
|
||||
if err := t.KillSession(sessionID); err != nil {
|
||||
// Kill the session.
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
// This prevents orphan bash processes from Claude's Bash tool surviving session termination.
|
||||
if err := t.KillSessionWithProcesses(sessionID); err != nil {
|
||||
return fmt.Errorf("killing session: %w", err)
|
||||
}
|
||||
|
||||
|
||||
@@ -510,8 +510,9 @@ func (c *ClaudeSettingsCheck) Fix(ctx *CheckContext) error {
|
||||
sf.agentType == "deacon" || sf.agentType == "mayor" {
|
||||
running, _ := t.HasSession(sf.sessionName)
|
||||
if running {
|
||||
// Cycle the agent by killing and letting gt up restart it
|
||||
_ = t.KillSession(sf.sessionName)
|
||||
// Cycle the agent by killing and letting gt up restart it.
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
_ = t.KillSessionWithProcesses(sf.sessionName)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -149,7 +149,8 @@ func (c *OrphanSessionCheck) Fix(ctx *CheckContext) error {
|
||||
// Log pre-death event for crash investigation (before killing)
|
||||
_ = events.LogFeed(events.TypeSessionDeath, sess,
|
||||
events.SessionDeathPayload(sess, "unknown", "orphan cleanup", "gt doctor"))
|
||||
if err := t.KillSession(sess); err != nil {
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
if err := t.KillSessionWithProcesses(sess); err != nil {
|
||||
lastErr = err
|
||||
}
|
||||
}
|
||||
|
||||
@@ -123,7 +123,8 @@ func (c *LinkedPaneCheck) Fix(ctx *CheckContext) error {
|
||||
var lastErr error
|
||||
|
||||
for _, session := range c.linkedSessions {
|
||||
if err := t.KillSession(session); err != nil {
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
if err := t.KillSessionWithProcesses(session); err != nil {
|
||||
lastErr = err
|
||||
}
|
||||
}
|
||||
|
||||
@@ -128,7 +128,8 @@ func (c *ZombieSessionCheck) Fix(ctx *CheckContext) error {
|
||||
_ = events.LogFeed(events.TypeSessionDeath, sess,
|
||||
events.SessionDeathPayload(sess, "unknown", "zombie cleanup", "gt doctor"))
|
||||
|
||||
if err := t.KillSession(sess); err != nil {
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
if err := t.KillSessionWithProcesses(sess); err != nil {
|
||||
lastErr = err
|
||||
}
|
||||
}
|
||||
|
||||
@@ -729,12 +729,13 @@ func (m *Manager) ReconcilePoolWith(namesWithDirs, namesWithSessions []string) {
|
||||
dirSet[name] = true
|
||||
}
|
||||
|
||||
// Kill orphaned sessions (session exists but no directory)
|
||||
// Kill orphaned sessions (session exists but no directory).
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
if m.tmux != nil {
|
||||
for _, name := range namesWithSessions {
|
||||
if !dirSet[name] {
|
||||
sessionName := fmt.Sprintf("gt-%s-%s", m.rig.Name, name)
|
||||
_ = m.tmux.KillSession(sessionName)
|
||||
_ = m.tmux.KillSessionWithProcesses(sessionName)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -289,7 +289,9 @@ func (m *SessionManager) Stop(polecat string, force bool) error {
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
}
|
||||
|
||||
if err := m.tmux.KillSession(sessionID); err != nil {
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
// This prevents orphan bash processes from Claude's Bash tool surviving session termination.
|
||||
if err := m.tmux.KillSessionWithProcesses(sessionID); err != nil {
|
||||
return fmt.Errorf("killing session: %w", err)
|
||||
}
|
||||
|
||||
|
||||
@@ -68,8 +68,9 @@ func stopTownSessionInternal(t *tmux.Tmux, ts TownSession, force bool) (bool, er
|
||||
_ = events.LogFeed(events.TypeSessionDeath, ts.SessionID,
|
||||
events.SessionDeathPayload(ts.SessionID, ts.Name, reason, "gt down"))
|
||||
|
||||
// Kill the session
|
||||
if err := t.KillSession(ts.SessionID); err != nil {
|
||||
// Kill the session.
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
||||
if err := t.KillSessionWithProcesses(ts.SessionID); err != nil {
|
||||
return false, fmt.Errorf("killing %s session: %w", ts.Name, err)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user