fix(daemon): Kill zombie tmux sessions before recreating

The daemon was failing to restart agents when zombie tmux sessions existed
(session alive but Claude dead). Added EnsureSessionFresh() helper to
tmux package that:
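- Checks whether the session exists
- If it exists and Claude is running (healthy), leaves it untouched
- If it exists but Claude is not running (zombie), kills the session
- Creates a fresh session when none exists or the zombie was killed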

Updated all daemon session creation points to use EnsureSessionFresh:
- ensureDeaconRunning()
- ensureWitnessRunning()
- restartPolecatSession()
- restartSession() in lifecycle.go

Added tests for the new helper function. (gt-j1i0r)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
dag
2026-01-02 18:52:06 -08:00
committed by Steve Yegge
parent 883997e044
commit 9ad826cd8c
4 changed files with 137 additions and 4 deletions

View File

@@ -277,8 +277,9 @@ func (d *Daemon) ensureDeaconRunning() {
d.logger.Println("Deacon not running per agent bead, starting...")
// Create session in deacon directory (ensures correct CLAUDE.md is loaded)
// Use EnsureSessionFresh to handle zombie sessions that exist but have dead Claude
deaconDir := filepath.Join(d.config.TownRoot, "deacon")
if err := d.tmux.NewSession(DeaconSessionName, deaconDir); err != nil {
if err := d.tmux.EnsureSessionFresh(DeaconSessionName, deaconDir); err != nil {
d.logger.Printf("Error creating Deacon session: %v", err)
return
}
@@ -374,8 +375,9 @@ func (d *Daemon) ensureWitnessRunning(rigName string) {
d.logger.Printf("Witness for %s not running per agent bead, starting...", rigName)
// Create session in witness directory
// Use EnsureSessionFresh to handle zombie sessions that exist but have dead Claude
witnessDir := filepath.Join(d.config.TownRoot, rigName, "witness")
if err := d.tmux.NewSession(sessionName, witnessDir); err != nil {
if err := d.tmux.EnsureSessionFresh(sessionName, witnessDir); err != nil {
d.logger.Printf("Error creating witness session for %s: %v", rigName, err)
return
}
@@ -664,7 +666,8 @@ func (d *Daemon) restartPolecatSession(rigName, polecatName, sessionName string)
d.syncWorkspace(workDir)
// Create new tmux session
if err := d.tmux.NewSession(sessionName, workDir); err != nil {
// Use EnsureSessionFresh to handle zombie sessions that exist but have dead Claude
if err := d.tmux.EnsureSessionFresh(sessionName, workDir); err != nil {
return fmt.Errorf("creating session: %w", err)
}

View File

@@ -348,7 +348,8 @@ func (d *Daemon) restartSession(sessionName, identity string) error {
}
// Create session
if err := d.tmux.NewSession(sessionName, workDir); err != nil {
// Use EnsureSessionFresh to handle zombie sessions that exist but have dead Claude
if err := d.tmux.EnsureSessionFresh(sessionName, workDir); err != nil {
return fmt.Errorf("creating session: %w", err)
}

View File

@@ -76,6 +76,40 @@ func (t *Tmux) NewSession(name, workDir string) error {
return err
}
// EnsureSessionFresh guarantees that a usable tmux session called name is in
// place, replacing a stale one if necessary.
//
// Three situations are handled:
//   - No session named name exists: a new one is created in workDir.
//   - A session exists and IsClaudeRunning reports true: it is left untouched.
//   - A session exists but IsClaudeRunning reports false (a "zombie"): the
//     session is killed and then recreated in workDir.
//
// Killing the zombie first avoids the "session already exists" failure that
// would otherwise occur when trying to restart a dead agent.
//
// A nil return means a session is available, either the pre-existing healthy
// one or a newly created one. Failures of the existence check and of the
// zombie kill are wrapped with context; the result of NewSession is returned
// as-is.
func (t *Tmux) EnsureSessionFresh(name, workDir string) error {
	present, err := t.HasSession(name)
	if err != nil {
		return fmt.Errorf("checking session: %w", err)
	}

	if present {
		// Healthy session (Claude running): nothing to do.
		if t.IsClaudeRunning(name) {
			return nil
		}

		// Zombie: tmux is alive but Claude is dead. Remove it so the
		// creation below does not collide with the existing name.
		if killErr := t.KillSession(name); killErr != nil {
			return fmt.Errorf("killing zombie session: %w", killErr)
		}
	}

	return t.NewSession(name, workDir)
}
// KillSession terminates a tmux session.
func (t *Tmux) KillSession(name string) error {
_, err := t.run("kill-session", "-t", name)

View File

@@ -209,3 +209,98 @@ func TestWrapError(t *testing.T) {
}
}
}
// TestEnsureSessionFresh_NoExistingSession checks that EnsureSessionFresh
// creates a session when none with the given name exists beforehand.
// The test is skipped when tmux is not installed.
func TestEnsureSessionFresh_NoExistingSession(t *testing.T) {
	if !hasTmux() {
		t.Skip("tmux not installed")
	}

	client := NewTmux()
	name := "gt-test-fresh-" + t.Name()

	// Best-effort removal of leftovers from a previous run; an error here
	// just means there was nothing to kill.
	_ = client.KillSession(name)

	// With no session present, the helper is expected to create one.
	err := client.EnsureSessionFresh(name, "")
	if err != nil {
		t.Fatalf("EnsureSessionFresh: %v", err)
	}
	defer func() { _ = client.KillSession(name) }()

	// Confirm the session is now reported as existing.
	found, err := client.HasSession(name)
	switch {
	case err != nil:
		t.Fatalf("HasSession: %v", err)
	case !found:
		t.Error("expected session to exist after EnsureSessionFresh")
	}
}
// TestEnsureSessionFresh_ZombieSession checks that EnsureSessionFresh succeeds
// when a session with the requested name already exists but is not running
// Claude, and that a session with that name exists afterwards.
// The test is skipped when tmux is not installed, or when the freshly created
// session unexpectedly reports Claude as running.
func TestEnsureSessionFresh_ZombieSession(t *testing.T) {
	if !hasTmux() {
		t.Skip("tmux not installed")
	}

	client := NewTmux()
	name := "gt-test-zombie-" + t.Name()

	// Best-effort removal of leftovers from a previous run.
	_ = client.KillSession(name)

	// A plain tmux session (just a shell, no Claude/node) counts as a
	// "zombie" for the purposes of this test.
	err := client.NewSession(name, "")
	if err != nil {
		t.Fatalf("NewSession: %v", err)
	}
	defer func() { _ = client.KillSession(name) }()

	// Precondition: the session must not look healthy, otherwise the
	// zombie path would never be exercised.
	if client.IsClaudeRunning(name) {
		t.Skip("session unexpectedly has Claude running - can't test zombie case")
	}

	// The helper must replace the zombie rather than fail with
	// "session already exists".
	err = client.EnsureSessionFresh(name, "")
	if err != nil {
		t.Fatalf("EnsureSessionFresh on zombie: %v", err)
	}

	// A session with this name should still be present afterwards.
	found, err := client.HasSession(name)
	switch {
	case err != nil:
		t.Fatalf("HasSession: %v", err)
	case !found:
		t.Error("expected session to exist after EnsureSessionFresh on zombie")
	}
}
// TestEnsureSessionFresh_IdempotentOnZombie checks that EnsureSessionFresh can
// be called repeatedly for the same session name without error, and that the
// session exists afterwards. No Claude process is started in the session, so
// every call after the first encounters an existing session without Claude.
// The test is skipped when tmux is not installed.
func TestEnsureSessionFresh_IdempotentOnZombie(t *testing.T) {
	if !hasTmux() {
		t.Skip("tmux not installed")
	}

	tm := NewTmux()
	sessionName := "gt-test-idem-" + t.Name()

	// Clean up any existing session left over from a previous run.
	_ = tm.KillSession(sessionName)

	// Register cleanup BEFORE the loop: t.Fatalf exits the test goroutine and
	// only runs defers that are already installed, so deferring after the
	// loop would leak the session created by an earlier, successful attempt
	// whenever a later attempt fails.
	defer func() { _ = tm.KillSession(sessionName) }()

	// Call EnsureSessionFresh multiple times - should work each time.
	for i := 0; i < 3; i++ {
		if err := tm.EnsureSessionFresh(sessionName, ""); err != nil {
			t.Fatalf("EnsureSessionFresh attempt %d: %v", i+1, err)
		}
	}

	// Session should exist after the repeated calls.
	has, err := tm.HasSession(sessionName)
	if err != nil {
		t.Fatalf("HasSession: %v", err)
	}
	if !has {
		t.Error("expected session to exist after multiple EnsureSessionFresh calls")
	}
}