fix(tmux): kill process group to prevent orphaned processes
Some checks failed
CI / Check for .beads changes (push) Has been skipped
CI / Check embedded formulas (push) Failing after 18s
CI / Test (push) Failing after 1m33s
CI / Lint (push) Failing after 23s
CI / Integration Tests (push) Successful in 1m36s
CI / Coverage Report (push) Has been skipped
Windows CI / Windows Build and Unit Tests (push) Has been cancelled
Some checks failed
CI / Check for .beads changes (push) Has been skipped
CI / Check embedded formulas (push) Failing after 18s
CI / Test (push) Failing after 1m33s
CI / Lint (push) Failing after 23s
CI / Integration Tests (push) Successful in 1m36s
CI / Coverage Report (push) Has been skipped
Windows CI / Windows Build and Unit Tests (push) Has been cancelled
KillSession was leaving orphaned Claude/node processes because pgrep -P only finds direct children. Processes that reparent to init (PID 1) were missed. Changes: - Kill entire process group first using kill -TERM/-KILL -<pgid> - Add getProcessGroupID() and getProcessGroupMembers() helpers - Update KillSessionWithProcesses, KillSessionWithProcessesExcluding, and KillPaneProcesses to use process group killing - Fix EnsureSessionFresh to use KillSessionWithProcesses instead of basic KillSession Fixes: gt-w1dcvq Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -128,7 +128,8 @@ func (t *Tmux) EnsureSessionFresh(name, workDir string) error {
|
|||||||
if !t.IsAgentRunning(name) {
|
if !t.IsAgentRunning(name) {
|
||||||
// Zombie session: tmux alive but Claude dead
|
// Zombie session: tmux alive but Claude dead
|
||||||
// Kill it so we can create a fresh one
|
// Kill it so we can create a fresh one
|
||||||
if err := t.KillSession(name); err != nil {
|
// Use KillSessionWithProcesses to ensure all descendant processes are killed
|
||||||
|
if err := t.KillSessionWithProcesses(name); err != nil {
|
||||||
return fmt.Errorf("killing zombie session: %w", err)
|
return fmt.Errorf("killing zombie session: %w", err)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@@ -156,13 +157,18 @@ const processKillGracePeriod = 2 * time.Second
|
|||||||
// This prevents orphan processes that survive tmux kill-session due to SIGHUP being ignored.
|
// This prevents orphan processes that survive tmux kill-session due to SIGHUP being ignored.
|
||||||
//
|
//
|
||||||
// Process:
|
// Process:
|
||||||
// 1. Get the pane's main process PID
|
// 1. Get the pane's main process PID and its process group ID (PGID)
|
||||||
// 2. Find all descendant processes recursively (not just direct children)
|
// 2. Kill the entire process group (catches reparented processes that stayed in the group)
|
||||||
// 3. Send SIGTERM to all descendants (deepest first)
|
// 3. Find all descendant processes recursively (catches any stragglers)
|
||||||
// 4. Wait 2s for graceful shutdown
|
// 4. Send SIGTERM/SIGKILL to descendants
|
||||||
// 5. Send SIGKILL to any remaining descendants
|
// 5. Kill the pane process itself
|
||||||
// 6. Kill the tmux session
|
// 6. Kill the tmux session
|
||||||
//
|
//
|
||||||
|
// The process group kill is critical because:
|
||||||
|
// - pgrep -P only finds direct children (PPID matching)
|
||||||
|
// - Processes that reparent to init (PID 1) are missed by pgrep
|
||||||
|
// - But they typically stay in the same process group unless they call setsid()
|
||||||
|
//
|
||||||
// This ensures Claude processes and all their children are properly terminated.
|
// This ensures Claude processes and all their children are properly terminated.
|
||||||
func (t *Tmux) KillSessionWithProcesses(name string) error {
|
func (t *Tmux) KillSessionWithProcesses(name string) error {
|
||||||
// Get the pane PID
|
// Get the pane PID
|
||||||
@@ -173,7 +179,22 @@ func (t *Tmux) KillSessionWithProcesses(name string) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if pid != "" {
|
if pid != "" {
|
||||||
// Get all descendant PIDs recursively (returns deepest-first order)
|
// First, kill the entire process group. This catches processes that:
|
||||||
|
// - Reparented to init (PID 1) when their parent died
|
||||||
|
// - Are not direct children but stayed in the same process group
|
||||||
|
// Note: Processes that called setsid() will have a new PGID and won't be killed here
|
||||||
|
pgid := getProcessGroupID(pid)
|
||||||
|
if pgid != "" && pgid != "0" && pgid != "1" {
|
||||||
|
// Kill process group with negative PGID (POSIX convention)
|
||||||
|
// Use SIGTERM first for graceful shutdown
|
||||||
|
_ = exec.Command("kill", "-TERM", "-"+pgid).Run()
|
||||||
|
time.Sleep(100 * time.Millisecond)
|
||||||
|
// Force kill any remaining processes in the group
|
||||||
|
_ = exec.Command("kill", "-KILL", "-"+pgid).Run()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Also walk the process tree for any descendants that might have called setsid()
|
||||||
|
// and created their own process groups (rare but possible)
|
||||||
descendants := getAllDescendants(pid)
|
descendants := getAllDescendants(pid)
|
||||||
|
|
||||||
// Send SIGTERM to all descendants (deepest first to avoid orphaning)
|
// Send SIGTERM to all descendants (deepest first to avoid orphaning)
|
||||||
@@ -224,27 +245,45 @@ func (t *Tmux) KillSessionWithProcessesExcluding(name string, excludePIDs []stri
|
|||||||
}
|
}
|
||||||
|
|
||||||
if pid != "" {
|
if pid != "" {
|
||||||
// Get all descendant PIDs recursively (returns deepest-first order)
|
// Get the process group ID
|
||||||
descendants := getAllDescendants(pid)
|
pgid := getProcessGroupID(pid)
|
||||||
|
|
||||||
// Filter out excluded PIDs
|
// Collect all PIDs to kill (from multiple sources)
|
||||||
var filtered []string
|
toKill := make(map[string]bool)
|
||||||
|
|
||||||
|
// 1. Get all process group members (catches reparented processes)
|
||||||
|
if pgid != "" && pgid != "0" && pgid != "1" {
|
||||||
|
for _, member := range getProcessGroupMembers(pgid) {
|
||||||
|
if !exclude[member] {
|
||||||
|
toKill[member] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. Get all descendant PIDs recursively (catches processes that called setsid())
|
||||||
|
descendants := getAllDescendants(pid)
|
||||||
for _, dpid := range descendants {
|
for _, dpid := range descendants {
|
||||||
if !exclude[dpid] {
|
if !exclude[dpid] {
|
||||||
filtered = append(filtered, dpid)
|
toKill[dpid] = true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Send SIGTERM to all non-excluded descendants (deepest first to avoid orphaning)
|
// Convert to slice for iteration
|
||||||
for _, dpid := range filtered {
|
var killList []string
|
||||||
|
for p := range toKill {
|
||||||
|
killList = append(killList, p)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Send SIGTERM to all non-excluded processes
|
||||||
|
for _, dpid := range killList {
|
||||||
_ = exec.Command("kill", "-TERM", dpid).Run()
|
_ = exec.Command("kill", "-TERM", dpid).Run()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wait for graceful shutdown (2s gives processes time to clean up)
|
// Wait for graceful shutdown (2s gives processes time to clean up)
|
||||||
time.Sleep(processKillGracePeriod)
|
time.Sleep(processKillGracePeriod)
|
||||||
|
|
||||||
// Send SIGKILL to any remaining non-excluded descendants
|
// Send SIGKILL to any remaining non-excluded processes
|
||||||
for _, dpid := range filtered {
|
for _, dpid := range killList {
|
||||||
_ = exec.Command("kill", "-KILL", dpid).Run()
|
_ = exec.Command("kill", "-KILL", dpid).Run()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -289,16 +328,46 @@ func getAllDescendants(pid string) []string {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// getProcessGroupID returns the process group ID (PGID) for a given PID.
|
||||||
|
// Returns empty string if the process doesn't exist or PGID can't be determined.
|
||||||
|
func getProcessGroupID(pid string) string {
|
||||||
|
out, err := exec.Command("ps", "-o", "pgid=", "-p", pid).Output()
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(string(out))
|
||||||
|
}
|
||||||
|
|
||||||
|
// getProcessGroupMembers returns all PIDs in a process group.
|
||||||
|
// This finds processes that share the same PGID, including those that reparented to init.
|
||||||
|
func getProcessGroupMembers(pgid string) []string {
|
||||||
|
// Use ps to find all processes with this PGID
|
||||||
|
// On macOS: ps -axo pid,pgid
|
||||||
|
// On Linux: ps -eo pid,pgid
|
||||||
|
out, err := exec.Command("ps", "-axo", "pid,pgid").Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var members []string
|
||||||
|
for _, line := range strings.Split(string(out), "\n") {
|
||||||
|
fields := strings.Fields(line)
|
||||||
|
if len(fields) >= 2 && strings.TrimSpace(fields[1]) == pgid {
|
||||||
|
members = append(members, strings.TrimSpace(fields[0]))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return members
|
||||||
|
}
|
||||||
|
|
||||||
// KillPaneProcesses explicitly kills all processes associated with a tmux pane.
|
// KillPaneProcesses explicitly kills all processes associated with a tmux pane.
|
||||||
// This prevents orphan processes that survive pane respawn due to SIGHUP being ignored.
|
// This prevents orphan processes that survive pane respawn due to SIGHUP being ignored.
|
||||||
//
|
//
|
||||||
// Process:
|
// Process:
|
||||||
// 1. Get the pane's main process PID
|
// 1. Get the pane's main process PID and its process group ID (PGID)
|
||||||
// 2. Find all descendant processes recursively (not just direct children)
|
// 2. Kill the entire process group (catches reparented processes)
|
||||||
// 3. Send SIGTERM to all descendants (deepest first)
|
// 3. Find all descendant processes recursively (catches any stragglers)
|
||||||
// 4. Wait 2s for graceful shutdown
|
// 4. Send SIGTERM/SIGKILL to descendants
|
||||||
// 5. Send SIGKILL to any remaining descendants
|
// 5. Kill the pane process itself
|
||||||
// 6. Kill the pane process itself
|
|
||||||
//
|
//
|
||||||
// This ensures Claude processes and all their children are properly terminated
|
// This ensures Claude processes and all their children are properly terminated
|
||||||
// before respawning the pane.
|
// before respawning the pane.
|
||||||
@@ -313,7 +382,18 @@ func (t *Tmux) KillPaneProcesses(pane string) error {
|
|||||||
return fmt.Errorf("pane PID is empty")
|
return fmt.Errorf("pane PID is empty")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get all descendant PIDs recursively (returns deepest-first order)
|
// First, kill the entire process group. This catches processes that:
|
||||||
|
// - Reparented to init (PID 1) when their parent died
|
||||||
|
// - Are not direct children but stayed in the same process group
|
||||||
|
pgid := getProcessGroupID(pid)
|
||||||
|
if pgid != "" && pgid != "0" && pgid != "1" {
|
||||||
|
// Kill process group with negative PGID (POSIX convention)
|
||||||
|
_ = exec.Command("kill", "-TERM", "-"+pgid).Run()
|
||||||
|
time.Sleep(100 * time.Millisecond)
|
||||||
|
_ = exec.Command("kill", "-KILL", "-"+pgid).Run()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Also walk the process tree for any descendants that might have called setsid()
|
||||||
descendants := getAllDescendants(pid)
|
descendants := getAllDescendants(pid)
|
||||||
|
|
||||||
// Send SIGTERM to all descendants (deepest first to avoid orphaning)
|
// Send SIGTERM to all descendants (deepest first to avoid orphaning)
|
||||||
|
|||||||
@@ -1,10 +1,13 @@
|
|||||||
package tmux
|
package tmux
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func hasTmux() bool {
|
func hasTmux() bool {
|
||||||
@@ -705,6 +708,97 @@ func TestKillSessionWithProcessesExcluding_NonexistentSession(t *testing.T) {
|
|||||||
_ = err
|
_ = err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestGetProcessGroupID(t *testing.T) {
|
||||||
|
// Test with current process
|
||||||
|
pid := fmt.Sprintf("%d", os.Getpid())
|
||||||
|
pgid := getProcessGroupID(pid)
|
||||||
|
|
||||||
|
if pgid == "" {
|
||||||
|
t.Error("expected non-empty PGID for current process")
|
||||||
|
}
|
||||||
|
|
||||||
|
// PGID should not be 0 or 1 for a normal process
|
||||||
|
if pgid == "0" || pgid == "1" {
|
||||||
|
t.Errorf("unexpected PGID %q for current process", pgid)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test with nonexistent PID
|
||||||
|
pgid = getProcessGroupID("999999999")
|
||||||
|
if pgid != "" {
|
||||||
|
t.Errorf("expected empty PGID for nonexistent process, got %q", pgid)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestGetProcessGroupMembers(t *testing.T) {
|
||||||
|
// Get current process's PGID
|
||||||
|
pid := fmt.Sprintf("%d", os.Getpid())
|
||||||
|
pgid := getProcessGroupID(pid)
|
||||||
|
if pgid == "" {
|
||||||
|
t.Skip("could not get PGID for current process")
|
||||||
|
}
|
||||||
|
|
||||||
|
members := getProcessGroupMembers(pgid)
|
||||||
|
|
||||||
|
// Current process should be in the list
|
||||||
|
found := false
|
||||||
|
for _, m := range members {
|
||||||
|
if m == pid {
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !found {
|
||||||
|
t.Errorf("current process %s not found in process group %s members: %v", pid, pgid, members)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestKillSessionWithProcesses_KillsProcessGroup(t *testing.T) {
|
||||||
|
if !hasTmux() {
|
||||||
|
t.Skip("tmux not installed")
|
||||||
|
}
|
||||||
|
|
||||||
|
tm := NewTmux()
|
||||||
|
sessionName := "gt-test-killpg-" + t.Name()
|
||||||
|
|
||||||
|
// Clean up any existing session
|
||||||
|
_ = tm.KillSession(sessionName)
|
||||||
|
|
||||||
|
// Create session that spawns a child process
|
||||||
|
// The child will stay in the same process group as the shell
|
||||||
|
cmd := `sleep 300 & sleep 300`
|
||||||
|
if err := tm.NewSessionWithCommand(sessionName, "", cmd); err != nil {
|
||||||
|
t.Fatalf("NewSessionWithCommand: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Give processes time to start
|
||||||
|
time.Sleep(200 * time.Millisecond)
|
||||||
|
|
||||||
|
// Verify session exists
|
||||||
|
has, err := tm.HasSession(sessionName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("HasSession: %v", err)
|
||||||
|
}
|
||||||
|
if !has {
|
||||||
|
t.Fatal("expected session to exist after creation")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Kill with processes (should kill the entire process group)
|
||||||
|
if err := tm.KillSessionWithProcesses(sessionName); err != nil {
|
||||||
|
t.Fatalf("KillSessionWithProcesses: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify session is gone
|
||||||
|
has, err = tm.HasSession(sessionName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("HasSession after kill: %v", err)
|
||||||
|
}
|
||||||
|
if has {
|
||||||
|
t.Error("expected session to not exist after KillSessionWithProcesses")
|
||||||
|
_ = tm.KillSession(sessionName) // cleanup
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestSessionSet(t *testing.T) {
|
func TestSessionSet(t *testing.T) {
|
||||||
if !hasTmux() {
|
if !hasTmux() {
|
||||||
t.Skip("tmux not installed")
|
t.Skip("tmux not installed")
|
||||||
|
|||||||
Reference in New Issue
Block a user