fix(tmux): kill process group to prevent orphaned processes
Some checks failed
CI / Check for .beads changes (push) Has been skipped
CI / Check embedded formulas (push) Failing after 18s
CI / Test (push) Failing after 1m33s
CI / Lint (push) Failing after 23s
CI / Integration Tests (push) Successful in 1m36s
CI / Coverage Report (push) Has been skipped
Windows CI / Windows Build and Unit Tests (push) Has been cancelled
Some checks failed
CI / Check for .beads changes (push) Has been skipped
CI / Check embedded formulas (push) Failing after 18s
CI / Test (push) Failing after 1m33s
CI / Lint (push) Failing after 23s
CI / Integration Tests (push) Successful in 1m36s
CI / Coverage Report (push) Has been skipped
Windows CI / Windows Build and Unit Tests (push) Has been cancelled
KillSession was leaving orphaned Claude/node processes because pgrep -P only finds direct children. Processes that reparent to init (PID 1) were missed. Changes: - Kill entire process group first using kill -TERM/-KILL -<pgid> - Add getProcessGroupID() and getProcessGroupMembers() helpers - Update KillSessionWithProcesses, KillSessionWithProcessesExcluding, and KillPaneProcesses to use process group killing - Fix EnsureSessionFresh to use KillSessionWithProcesses instead of basic KillSession Fixes: gt-w1dcvq Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -128,7 +128,8 @@ func (t *Tmux) EnsureSessionFresh(name, workDir string) error {
|
||||
if !t.IsAgentRunning(name) {
|
||||
// Zombie session: tmux alive but Claude dead
|
||||
// Kill it so we can create a fresh one
|
||||
if err := t.KillSession(name); err != nil {
|
||||
// Use KillSessionWithProcesses to ensure all descendant processes are killed
|
||||
if err := t.KillSessionWithProcesses(name); err != nil {
|
||||
return fmt.Errorf("killing zombie session: %w", err)
|
||||
}
|
||||
} else {
|
||||
@@ -156,13 +157,18 @@ const processKillGracePeriod = 2 * time.Second
|
||||
// This prevents orphan processes that survive tmux kill-session due to SIGHUP being ignored.
|
||||
//
|
||||
// Process:
|
||||
// 1. Get the pane's main process PID
|
||||
// 2. Find all descendant processes recursively (not just direct children)
|
||||
// 3. Send SIGTERM to all descendants (deepest first)
|
||||
// 4. Wait 2s for graceful shutdown
|
||||
// 5. Send SIGKILL to any remaining descendants
|
||||
// 1. Get the pane's main process PID and its process group ID (PGID)
|
||||
// 2. Kill the entire process group (catches reparented processes that stayed in the group)
|
||||
// 3. Find all descendant processes recursively (catches any stragglers)
|
||||
// 4. Send SIGTERM/SIGKILL to descendants
|
||||
// 5. Kill the pane process itself
|
||||
// 6. Kill the tmux session
|
||||
//
|
||||
// The process group kill is critical because:
|
||||
// - pgrep -P only finds direct children (PPID matching)
|
||||
// - Processes that reparent to init (PID 1) are missed by pgrep
|
||||
// - But they typically stay in the same process group unless they call setsid()
|
||||
//
|
||||
// This ensures Claude processes and all their children are properly terminated.
|
||||
func (t *Tmux) KillSessionWithProcesses(name string) error {
|
||||
// Get the pane PID
|
||||
@@ -173,7 +179,22 @@ func (t *Tmux) KillSessionWithProcesses(name string) error {
|
||||
}
|
||||
|
||||
if pid != "" {
|
||||
// Get all descendant PIDs recursively (returns deepest-first order)
|
||||
// First, kill the entire process group. This catches processes that:
|
||||
// - Reparented to init (PID 1) when their parent died
|
||||
// - Are not direct children but stayed in the same process group
|
||||
// Note: Processes that called setsid() will have a new PGID and won't be killed here
|
||||
pgid := getProcessGroupID(pid)
|
||||
if pgid != "" && pgid != "0" && pgid != "1" {
|
||||
// Kill process group with negative PGID (POSIX convention)
|
||||
// Use SIGTERM first for graceful shutdown
|
||||
_ = exec.Command("kill", "-TERM", "-"+pgid).Run()
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
// Force kill any remaining processes in the group
|
||||
_ = exec.Command("kill", "-KILL", "-"+pgid).Run()
|
||||
}
|
||||
|
||||
// Also walk the process tree for any descendants that might have called setsid()
|
||||
// and created their own process groups (rare but possible)
|
||||
descendants := getAllDescendants(pid)
|
||||
|
||||
// Send SIGTERM to all descendants (deepest first to avoid orphaning)
|
||||
@@ -224,27 +245,45 @@ func (t *Tmux) KillSessionWithProcessesExcluding(name string, excludePIDs []stri
|
||||
}
|
||||
|
||||
if pid != "" {
|
||||
// Get all descendant PIDs recursively (returns deepest-first order)
|
||||
descendants := getAllDescendants(pid)
|
||||
// Get the process group ID
|
||||
pgid := getProcessGroupID(pid)
|
||||
|
||||
// Filter out excluded PIDs
|
||||
var filtered []string
|
||||
for _, dpid := range descendants {
|
||||
if !exclude[dpid] {
|
||||
filtered = append(filtered, dpid)
|
||||
// Collect all PIDs to kill (from multiple sources)
|
||||
toKill := make(map[string]bool)
|
||||
|
||||
// 1. Get all process group members (catches reparented processes)
|
||||
if pgid != "" && pgid != "0" && pgid != "1" {
|
||||
for _, member := range getProcessGroupMembers(pgid) {
|
||||
if !exclude[member] {
|
||||
toKill[member] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Send SIGTERM to all non-excluded descendants (deepest first to avoid orphaning)
|
||||
for _, dpid := range filtered {
|
||||
// 2. Get all descendant PIDs recursively (catches processes that called setsid())
|
||||
descendants := getAllDescendants(pid)
|
||||
for _, dpid := range descendants {
|
||||
if !exclude[dpid] {
|
||||
toKill[dpid] = true
|
||||
}
|
||||
}
|
||||
|
||||
// Convert to slice for iteration
|
||||
var killList []string
|
||||
for p := range toKill {
|
||||
killList = append(killList, p)
|
||||
}
|
||||
|
||||
// Send SIGTERM to all non-excluded processes
|
||||
for _, dpid := range killList {
|
||||
_ = exec.Command("kill", "-TERM", dpid).Run()
|
||||
}
|
||||
|
||||
// Wait for graceful shutdown (2s gives processes time to clean up)
|
||||
time.Sleep(processKillGracePeriod)
|
||||
|
||||
// Send SIGKILL to any remaining non-excluded descendants
|
||||
for _, dpid := range filtered {
|
||||
// Send SIGKILL to any remaining non-excluded processes
|
||||
for _, dpid := range killList {
|
||||
_ = exec.Command("kill", "-KILL", dpid).Run()
|
||||
}
|
||||
|
||||
@@ -289,16 +328,46 @@ func getAllDescendants(pid string) []string {
|
||||
return result
|
||||
}
|
||||
|
||||
// getProcessGroupID returns the process group ID (PGID) for a given PID.
|
||||
// Returns empty string if the process doesn't exist or PGID can't be determined.
|
||||
func getProcessGroupID(pid string) string {
|
||||
out, err := exec.Command("ps", "-o", "pgid=", "-p", pid).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
// getProcessGroupMembers returns all PIDs in a process group.
|
||||
// This finds processes that share the same PGID, including those that reparented to init.
|
||||
func getProcessGroupMembers(pgid string) []string {
|
||||
// Use ps to find all processes with this PGID
|
||||
// On macOS: ps -axo pid,pgid
|
||||
// On Linux: ps -eo pid,pgid
|
||||
out, err := exec.Command("ps", "-axo", "pid,pgid").Output()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var members []string
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) >= 2 && strings.TrimSpace(fields[1]) == pgid {
|
||||
members = append(members, strings.TrimSpace(fields[0]))
|
||||
}
|
||||
}
|
||||
return members
|
||||
}
|
||||
|
||||
// KillPaneProcesses explicitly kills all processes associated with a tmux pane.
|
||||
// This prevents orphan processes that survive pane respawn due to SIGHUP being ignored.
|
||||
//
|
||||
// Process:
|
||||
// 1. Get the pane's main process PID
|
||||
// 2. Find all descendant processes recursively (not just direct children)
|
||||
// 3. Send SIGTERM to all descendants (deepest first)
|
||||
// 4. Wait 2s for graceful shutdown
|
||||
// 5. Send SIGKILL to any remaining descendants
|
||||
// 6. Kill the pane process itself
|
||||
// 1. Get the pane's main process PID and its process group ID (PGID)
|
||||
// 2. Kill the entire process group (catches reparented processes)
|
||||
// 3. Find all descendant processes recursively (catches any stragglers)
|
||||
// 4. Send SIGTERM/SIGKILL to descendants
|
||||
// 5. Kill the pane process itself
|
||||
//
|
||||
// This ensures Claude processes and all their children are properly terminated
|
||||
// before respawning the pane.
|
||||
@@ -313,7 +382,18 @@ func (t *Tmux) KillPaneProcesses(pane string) error {
|
||||
return fmt.Errorf("pane PID is empty")
|
||||
}
|
||||
|
||||
// Get all descendant PIDs recursively (returns deepest-first order)
|
||||
// First, kill the entire process group. This catches processes that:
|
||||
// - Reparented to init (PID 1) when their parent died
|
||||
// - Are not direct children but stayed in the same process group
|
||||
pgid := getProcessGroupID(pid)
|
||||
if pgid != "" && pgid != "0" && pgid != "1" {
|
||||
// Kill process group with negative PGID (POSIX convention)
|
||||
_ = exec.Command("kill", "-TERM", "-"+pgid).Run()
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
_ = exec.Command("kill", "-KILL", "-"+pgid).Run()
|
||||
}
|
||||
|
||||
// Also walk the process tree for any descendants that might have called setsid()
|
||||
descendants := getAllDescendants(pid)
|
||||
|
||||
// Send SIGTERM to all descendants (deepest first to avoid orphaning)
|
||||
|
||||
@@ -1,10 +1,13 @@
|
||||
package tmux
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func hasTmux() bool {
|
||||
@@ -705,6 +708,97 @@ func TestKillSessionWithProcessesExcluding_NonexistentSession(t *testing.T) {
|
||||
_ = err
|
||||
}
|
||||
|
||||
func TestGetProcessGroupID(t *testing.T) {
|
||||
// Test with current process
|
||||
pid := fmt.Sprintf("%d", os.Getpid())
|
||||
pgid := getProcessGroupID(pid)
|
||||
|
||||
if pgid == "" {
|
||||
t.Error("expected non-empty PGID for current process")
|
||||
}
|
||||
|
||||
// PGID should not be 0 or 1 for a normal process
|
||||
if pgid == "0" || pgid == "1" {
|
||||
t.Errorf("unexpected PGID %q for current process", pgid)
|
||||
}
|
||||
|
||||
// Test with nonexistent PID
|
||||
pgid = getProcessGroupID("999999999")
|
||||
if pgid != "" {
|
||||
t.Errorf("expected empty PGID for nonexistent process, got %q", pgid)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetProcessGroupMembers(t *testing.T) {
|
||||
// Get current process's PGID
|
||||
pid := fmt.Sprintf("%d", os.Getpid())
|
||||
pgid := getProcessGroupID(pid)
|
||||
if pgid == "" {
|
||||
t.Skip("could not get PGID for current process")
|
||||
}
|
||||
|
||||
members := getProcessGroupMembers(pgid)
|
||||
|
||||
// Current process should be in the list
|
||||
found := false
|
||||
for _, m := range members {
|
||||
if m == pid {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !found {
|
||||
t.Errorf("current process %s not found in process group %s members: %v", pid, pgid, members)
|
||||
}
|
||||
}
|
||||
|
||||
func TestKillSessionWithProcesses_KillsProcessGroup(t *testing.T) {
|
||||
if !hasTmux() {
|
||||
t.Skip("tmux not installed")
|
||||
}
|
||||
|
||||
tm := NewTmux()
|
||||
sessionName := "gt-test-killpg-" + t.Name()
|
||||
|
||||
// Clean up any existing session
|
||||
_ = tm.KillSession(sessionName)
|
||||
|
||||
// Create session that spawns a child process
|
||||
// The child will stay in the same process group as the shell
|
||||
cmd := `sleep 300 & sleep 300`
|
||||
if err := tm.NewSessionWithCommand(sessionName, "", cmd); err != nil {
|
||||
t.Fatalf("NewSessionWithCommand: %v", err)
|
||||
}
|
||||
|
||||
// Give processes time to start
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
|
||||
// Verify session exists
|
||||
has, err := tm.HasSession(sessionName)
|
||||
if err != nil {
|
||||
t.Fatalf("HasSession: %v", err)
|
||||
}
|
||||
if !has {
|
||||
t.Fatal("expected session to exist after creation")
|
||||
}
|
||||
|
||||
// Kill with processes (should kill the entire process group)
|
||||
if err := tm.KillSessionWithProcesses(sessionName); err != nil {
|
||||
t.Fatalf("KillSessionWithProcesses: %v", err)
|
||||
}
|
||||
|
||||
// Verify session is gone
|
||||
has, err = tm.HasSession(sessionName)
|
||||
if err != nil {
|
||||
t.Fatalf("HasSession after kill: %v", err)
|
||||
}
|
||||
if has {
|
||||
t.Error("expected session to not exist after KillSessionWithProcesses")
|
||||
_ = tm.KillSession(sessionName) // cleanup
|
||||
}
|
||||
}
|
||||
|
||||
func TestSessionSet(t *testing.T) {
|
||||
if !hasTmux() {
|
||||
t.Skip("tmux not installed")
|
||||
|
||||
Reference in New Issue
Block a user