fix(daemon): Kill zombie tmux sessions before recreating
The daemon was failing to restart agents when zombie tmux sessions existed (session alive but Claude dead).

Added an EnsureSessionFresh() helper to the tmux package that:
- Checks if the session exists
- If it exists but Claude is not running (zombie), kills the session
- Creates a fresh session (a session with Claude still running is left untouched)

Updated all daemon session creation points to use EnsureSessionFresh:
- ensureDeaconRunning()
- ensureWitnessRunning()
- restartPolecatSession()
- restartSession() in lifecycle.go

Added tests for the new helper function. (gt-j1i0r)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -277,8 +277,9 @@ func (d *Daemon) ensureDeaconRunning() {
|
|||||||
d.logger.Println("Deacon not running per agent bead, starting...")
|
d.logger.Println("Deacon not running per agent bead, starting...")
|
||||||
|
|
||||||
// Create session in deacon directory (ensures correct CLAUDE.md is loaded)
|
// Create session in deacon directory (ensures correct CLAUDE.md is loaded)
|
||||||
|
// Use EnsureSessionFresh to handle zombie sessions that exist but have dead Claude
|
||||||
deaconDir := filepath.Join(d.config.TownRoot, "deacon")
|
deaconDir := filepath.Join(d.config.TownRoot, "deacon")
|
||||||
if err := d.tmux.NewSession(DeaconSessionName, deaconDir); err != nil {
|
if err := d.tmux.EnsureSessionFresh(DeaconSessionName, deaconDir); err != nil {
|
||||||
d.logger.Printf("Error creating Deacon session: %v", err)
|
d.logger.Printf("Error creating Deacon session: %v", err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -374,8 +375,9 @@ func (d *Daemon) ensureWitnessRunning(rigName string) {
|
|||||||
d.logger.Printf("Witness for %s not running per agent bead, starting...", rigName)
|
d.logger.Printf("Witness for %s not running per agent bead, starting...", rigName)
|
||||||
|
|
||||||
// Create session in witness directory
|
// Create session in witness directory
|
||||||
|
// Use EnsureSessionFresh to handle zombie sessions that exist but have dead Claude
|
||||||
witnessDir := filepath.Join(d.config.TownRoot, rigName, "witness")
|
witnessDir := filepath.Join(d.config.TownRoot, rigName, "witness")
|
||||||
if err := d.tmux.NewSession(sessionName, witnessDir); err != nil {
|
if err := d.tmux.EnsureSessionFresh(sessionName, witnessDir); err != nil {
|
||||||
d.logger.Printf("Error creating witness session for %s: %v", rigName, err)
|
d.logger.Printf("Error creating witness session for %s: %v", rigName, err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -664,7 +666,8 @@ func (d *Daemon) restartPolecatSession(rigName, polecatName, sessionName string)
|
|||||||
d.syncWorkspace(workDir)
|
d.syncWorkspace(workDir)
|
||||||
|
|
||||||
// Create new tmux session
|
// Create new tmux session
|
||||||
if err := d.tmux.NewSession(sessionName, workDir); err != nil {
|
// Use EnsureSessionFresh to handle zombie sessions that exist but have dead Claude
|
||||||
|
if err := d.tmux.EnsureSessionFresh(sessionName, workDir); err != nil {
|
||||||
return fmt.Errorf("creating session: %w", err)
|
return fmt.Errorf("creating session: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -348,7 +348,8 @@ func (d *Daemon) restartSession(sessionName, identity string) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Create session
|
// Create session
|
||||||
if err := d.tmux.NewSession(sessionName, workDir); err != nil {
|
// Use EnsureSessionFresh to handle zombie sessions that exist but have dead Claude
|
||||||
|
if err := d.tmux.EnsureSessionFresh(sessionName, workDir); err != nil {
|
||||||
return fmt.Errorf("creating session: %w", err)
|
return fmt.Errorf("creating session: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -76,6 +76,40 @@ func (t *Tmux) NewSession(name, workDir string) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// EnsureSessionFresh ensures a session is available and healthy.
|
||||||
|
// If the session exists but is a zombie (Claude not running), it kills the session first.
|
||||||
|
// This prevents "session already exists" errors when trying to restart dead agents.
|
||||||
|
//
|
||||||
|
// A session is considered a zombie if:
|
||||||
|
// - The tmux session exists
|
||||||
|
// - But Claude (node process) is not running in it
|
||||||
|
//
|
||||||
|
// Returns nil if session was created successfully.
|
||||||
|
func (t *Tmux) EnsureSessionFresh(name, workDir string) error {
|
||||||
|
// Check if session already exists
|
||||||
|
exists, err := t.HasSession(name)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("checking session: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if exists {
|
||||||
|
// Session exists - check if it's a zombie
|
||||||
|
if !t.IsClaudeRunning(name) {
|
||||||
|
// Zombie session: tmux alive but Claude dead
|
||||||
|
// Kill it so we can create a fresh one
|
||||||
|
if err := t.KillSession(name); err != nil {
|
||||||
|
return fmt.Errorf("killing zombie session: %w", err)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Session is healthy (Claude running) - nothing to do
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create fresh session
|
||||||
|
return t.NewSession(name, workDir)
|
||||||
|
}
|
||||||
|
|
||||||
// KillSession terminates a tmux session.
|
// KillSession terminates a tmux session.
|
||||||
func (t *Tmux) KillSession(name string) error {
|
func (t *Tmux) KillSession(name string) error {
|
||||||
_, err := t.run("kill-session", "-t", name)
|
_, err := t.run("kill-session", "-t", name)
|
||||||
|
|||||||
@@ -209,3 +209,98 @@ func TestWrapError(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestEnsureSessionFresh_NoExistingSession(t *testing.T) {
|
||||||
|
if !hasTmux() {
|
||||||
|
t.Skip("tmux not installed")
|
||||||
|
}
|
||||||
|
|
||||||
|
tm := NewTmux()
|
||||||
|
sessionName := "gt-test-fresh-" + t.Name()
|
||||||
|
|
||||||
|
// Clean up any existing session
|
||||||
|
_ = tm.KillSession(sessionName)
|
||||||
|
|
||||||
|
// EnsureSessionFresh should create a new session
|
||||||
|
if err := tm.EnsureSessionFresh(sessionName, ""); err != nil {
|
||||||
|
t.Fatalf("EnsureSessionFresh: %v", err)
|
||||||
|
}
|
||||||
|
defer func() { _ = tm.KillSession(sessionName) }()
|
||||||
|
|
||||||
|
// Verify session exists
|
||||||
|
has, err := tm.HasSession(sessionName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("HasSession: %v", err)
|
||||||
|
}
|
||||||
|
if !has {
|
||||||
|
t.Error("expected session to exist after EnsureSessionFresh")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEnsureSessionFresh_ZombieSession(t *testing.T) {
|
||||||
|
if !hasTmux() {
|
||||||
|
t.Skip("tmux not installed")
|
||||||
|
}
|
||||||
|
|
||||||
|
tm := NewTmux()
|
||||||
|
sessionName := "gt-test-zombie-" + t.Name()
|
||||||
|
|
||||||
|
// Clean up any existing session
|
||||||
|
_ = tm.KillSession(sessionName)
|
||||||
|
|
||||||
|
// Create a zombie session (session exists but no Claude/node running)
|
||||||
|
// A normal tmux session with bash/zsh is a "zombie" for our purposes
|
||||||
|
if err := tm.NewSession(sessionName, ""); err != nil {
|
||||||
|
t.Fatalf("NewSession: %v", err)
|
||||||
|
}
|
||||||
|
defer func() { _ = tm.KillSession(sessionName) }()
|
||||||
|
|
||||||
|
// Verify it's a zombie (not running Claude/node)
|
||||||
|
if tm.IsClaudeRunning(sessionName) {
|
||||||
|
t.Skip("session unexpectedly has Claude running - can't test zombie case")
|
||||||
|
}
|
||||||
|
|
||||||
|
// EnsureSessionFresh should kill the zombie and create fresh session
|
||||||
|
// This should NOT error with "session already exists"
|
||||||
|
if err := tm.EnsureSessionFresh(sessionName, ""); err != nil {
|
||||||
|
t.Fatalf("EnsureSessionFresh on zombie: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Session should still exist
|
||||||
|
has, err := tm.HasSession(sessionName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("HasSession: %v", err)
|
||||||
|
}
|
||||||
|
if !has {
|
||||||
|
t.Error("expected session to exist after EnsureSessionFresh on zombie")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEnsureSessionFresh_IdempotentOnZombie(t *testing.T) {
|
||||||
|
if !hasTmux() {
|
||||||
|
t.Skip("tmux not installed")
|
||||||
|
}
|
||||||
|
|
||||||
|
tm := NewTmux()
|
||||||
|
sessionName := "gt-test-idem-" + t.Name()
|
||||||
|
|
||||||
|
// Clean up any existing session
|
||||||
|
_ = tm.KillSession(sessionName)
|
||||||
|
|
||||||
|
// Call EnsureSessionFresh multiple times - should work each time
|
||||||
|
for i := 0; i < 3; i++ {
|
||||||
|
if err := tm.EnsureSessionFresh(sessionName, ""); err != nil {
|
||||||
|
t.Fatalf("EnsureSessionFresh attempt %d: %v", i+1, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
defer func() { _ = tm.KillSession(sessionName) }()
|
||||||
|
|
||||||
|
// Session should exist
|
||||||
|
has, err := tm.HasSession(sessionName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("HasSession: %v", err)
|
||||||
|
}
|
||||||
|
if !has {
|
||||||
|
t.Error("expected session to exist after multiple EnsureSessionFresh calls")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user