fix(daemon): Kill zombie tmux sessions before recreating
The daemon was failing to restart agents when zombie tmux sessions existed (session alive but Claude dead).

Added an EnsureSessionFresh() helper to the tmux package that:
- Checks whether the session exists
- If it exists but Claude is not running (zombie), kills the session
- Creates a fresh session

Updated all daemon session creation points to use EnsureSessionFresh:
- ensureDeaconRunning()
- ensureWitnessRunning()
- restartPolecatSession()
- restartSession() in lifecycle.go

Added tests for the new helper function. (gt-j1i0r)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -277,8 +277,9 @@ func (d *Daemon) ensureDeaconRunning() {
|
||||
d.logger.Println("Deacon not running per agent bead, starting...")
|
||||
|
||||
// Create session in deacon directory (ensures correct CLAUDE.md is loaded)
|
||||
// Use EnsureSessionFresh to handle zombie sessions that exist but have dead Claude
|
||||
deaconDir := filepath.Join(d.config.TownRoot, "deacon")
|
||||
if err := d.tmux.NewSession(DeaconSessionName, deaconDir); err != nil {
|
||||
if err := d.tmux.EnsureSessionFresh(DeaconSessionName, deaconDir); err != nil {
|
||||
d.logger.Printf("Error creating Deacon session: %v", err)
|
||||
return
|
||||
}
|
||||
@@ -374,8 +375,9 @@ func (d *Daemon) ensureWitnessRunning(rigName string) {
|
||||
d.logger.Printf("Witness for %s not running per agent bead, starting...", rigName)
|
||||
|
||||
// Create session in witness directory
|
||||
// Use EnsureSessionFresh to handle zombie sessions that exist but have dead Claude
|
||||
witnessDir := filepath.Join(d.config.TownRoot, rigName, "witness")
|
||||
if err := d.tmux.NewSession(sessionName, witnessDir); err != nil {
|
||||
if err := d.tmux.EnsureSessionFresh(sessionName, witnessDir); err != nil {
|
||||
d.logger.Printf("Error creating witness session for %s: %v", rigName, err)
|
||||
return
|
||||
}
|
||||
@@ -664,7 +666,8 @@ func (d *Daemon) restartPolecatSession(rigName, polecatName, sessionName string)
|
||||
d.syncWorkspace(workDir)
|
||||
|
||||
// Create new tmux session
|
||||
if err := d.tmux.NewSession(sessionName, workDir); err != nil {
|
||||
// Use EnsureSessionFresh to handle zombie sessions that exist but have dead Claude
|
||||
if err := d.tmux.EnsureSessionFresh(sessionName, workDir); err != nil {
|
||||
return fmt.Errorf("creating session: %w", err)
|
||||
}
|
||||
|
||||
|
||||
@@ -348,7 +348,8 @@ func (d *Daemon) restartSession(sessionName, identity string) error {
|
||||
}
|
||||
|
||||
// Create session
|
||||
if err := d.tmux.NewSession(sessionName, workDir); err != nil {
|
||||
// Use EnsureSessionFresh to handle zombie sessions that exist but have dead Claude
|
||||
if err := d.tmux.EnsureSessionFresh(sessionName, workDir); err != nil {
|
||||
return fmt.Errorf("creating session: %w", err)
|
||||
}
|
||||
|
||||
|
||||
@@ -76,6 +76,40 @@ func (t *Tmux) NewSession(name, workDir string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
// EnsureSessionFresh ensures a session is available and healthy.
|
||||
// If the session exists but is a zombie (Claude not running), it kills the session first.
|
||||
// This prevents "session already exists" errors when trying to restart dead agents.
|
||||
//
|
||||
// A session is considered a zombie if:
|
||||
// - The tmux session exists
|
||||
// - But Claude (node process) is not running in it
|
||||
//
|
||||
// Returns nil if session was created successfully.
|
||||
func (t *Tmux) EnsureSessionFresh(name, workDir string) error {
|
||||
// Check if session already exists
|
||||
exists, err := t.HasSession(name)
|
||||
if err != nil {
|
||||
return fmt.Errorf("checking session: %w", err)
|
||||
}
|
||||
|
||||
if exists {
|
||||
// Session exists - check if it's a zombie
|
||||
if !t.IsClaudeRunning(name) {
|
||||
// Zombie session: tmux alive but Claude dead
|
||||
// Kill it so we can create a fresh one
|
||||
if err := t.KillSession(name); err != nil {
|
||||
return fmt.Errorf("killing zombie session: %w", err)
|
||||
}
|
||||
} else {
|
||||
// Session is healthy (Claude running) - nothing to do
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// Create fresh session
|
||||
return t.NewSession(name, workDir)
|
||||
}
|
||||
|
||||
// KillSession terminates a tmux session.
|
||||
func (t *Tmux) KillSession(name string) error {
|
||||
_, err := t.run("kill-session", "-t", name)
|
||||
|
||||
@@ -209,3 +209,98 @@ func TestWrapError(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnsureSessionFresh_NoExistingSession(t *testing.T) {
|
||||
if !hasTmux() {
|
||||
t.Skip("tmux not installed")
|
||||
}
|
||||
|
||||
tm := NewTmux()
|
||||
sessionName := "gt-test-fresh-" + t.Name()
|
||||
|
||||
// Clean up any existing session
|
||||
_ = tm.KillSession(sessionName)
|
||||
|
||||
// EnsureSessionFresh should create a new session
|
||||
if err := tm.EnsureSessionFresh(sessionName, ""); err != nil {
|
||||
t.Fatalf("EnsureSessionFresh: %v", err)
|
||||
}
|
||||
defer func() { _ = tm.KillSession(sessionName) }()
|
||||
|
||||
// Verify session exists
|
||||
has, err := tm.HasSession(sessionName)
|
||||
if err != nil {
|
||||
t.Fatalf("HasSession: %v", err)
|
||||
}
|
||||
if !has {
|
||||
t.Error("expected session to exist after EnsureSessionFresh")
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnsureSessionFresh_ZombieSession(t *testing.T) {
|
||||
if !hasTmux() {
|
||||
t.Skip("tmux not installed")
|
||||
}
|
||||
|
||||
tm := NewTmux()
|
||||
sessionName := "gt-test-zombie-" + t.Name()
|
||||
|
||||
// Clean up any existing session
|
||||
_ = tm.KillSession(sessionName)
|
||||
|
||||
// Create a zombie session (session exists but no Claude/node running)
|
||||
// A normal tmux session with bash/zsh is a "zombie" for our purposes
|
||||
if err := tm.NewSession(sessionName, ""); err != nil {
|
||||
t.Fatalf("NewSession: %v", err)
|
||||
}
|
||||
defer func() { _ = tm.KillSession(sessionName) }()
|
||||
|
||||
// Verify it's a zombie (not running Claude/node)
|
||||
if tm.IsClaudeRunning(sessionName) {
|
||||
t.Skip("session unexpectedly has Claude running - can't test zombie case")
|
||||
}
|
||||
|
||||
// EnsureSessionFresh should kill the zombie and create fresh session
|
||||
// This should NOT error with "session already exists"
|
||||
if err := tm.EnsureSessionFresh(sessionName, ""); err != nil {
|
||||
t.Fatalf("EnsureSessionFresh on zombie: %v", err)
|
||||
}
|
||||
|
||||
// Session should still exist
|
||||
has, err := tm.HasSession(sessionName)
|
||||
if err != nil {
|
||||
t.Fatalf("HasSession: %v", err)
|
||||
}
|
||||
if !has {
|
||||
t.Error("expected session to exist after EnsureSessionFresh on zombie")
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnsureSessionFresh_IdempotentOnZombie(t *testing.T) {
|
||||
if !hasTmux() {
|
||||
t.Skip("tmux not installed")
|
||||
}
|
||||
|
||||
tm := NewTmux()
|
||||
sessionName := "gt-test-idem-" + t.Name()
|
||||
|
||||
// Clean up any existing session
|
||||
_ = tm.KillSession(sessionName)
|
||||
|
||||
// Call EnsureSessionFresh multiple times - should work each time
|
||||
for i := 0; i < 3; i++ {
|
||||
if err := tm.EnsureSessionFresh(sessionName, ""); err != nil {
|
||||
t.Fatalf("EnsureSessionFresh attempt %d: %v", i+1, err)
|
||||
}
|
||||
}
|
||||
defer func() { _ = tm.KillSession(sessionName) }()
|
||||
|
||||
// Session should exist
|
||||
has, err := tm.HasSession(sessionName)
|
||||
if err != nil {
|
||||
t.Fatalf("HasSession: %v", err)
|
||||
}
|
||||
if !has {
|
||||
t.Error("expected session to exist after multiple EnsureSessionFresh calls")
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user