From 9ad826cd8cacd913d5109074cbfe36f23b794315 Mon Sep 17 00:00:00 2001
From: dag
Date: Fri, 2 Jan 2026 18:52:06 -0800
Subject: [PATCH] fix(daemon): Kill zombie tmux sessions before recreating
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The daemon was failing to restart agents when zombie tmux sessions existed
(session alive but Claude dead). Added EnsureSessionFresh() helper to tmux
package that:
- Checks if session exists
- If exists but Claude not running (zombie), kills the session
- Creates fresh session

Updated all daemon session creation points to use EnsureSessionFresh:
- ensureDeaconRunning()
- ensureWitnessRunning()
- restartPolecatSession()
- restartSession() in lifecycle.go

Added tests for the new helper function.

(gt-j1i0r)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5
---
 internal/daemon/daemon.go    |  9 ++--
 internal/daemon/lifecycle.go |  3 +-
 internal/tmux/tmux.go        | 34 +++++++++++++
 internal/tmux/tmux_test.go   | 95 ++++++++++++++++++++++++++++++++++++
 4 files changed, 137 insertions(+), 4 deletions(-)

diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go
index 49630163..439a831b 100644
--- a/internal/daemon/daemon.go
+++ b/internal/daemon/daemon.go
@@ -277,8 +277,9 @@ func (d *Daemon) ensureDeaconRunning() {
 	d.logger.Println("Deacon not running per agent bead, starting...")
 
 	// Create session in deacon directory (ensures correct CLAUDE.md is loaded)
+	// Use EnsureSessionFresh to handle zombie sessions that exist but have dead Claude
 	deaconDir := filepath.Join(d.config.TownRoot, "deacon")
-	if err := d.tmux.NewSession(DeaconSessionName, deaconDir); err != nil {
+	if err := d.tmux.EnsureSessionFresh(DeaconSessionName, deaconDir); err != nil {
 		d.logger.Printf("Error creating Deacon session: %v", err)
 		return
 	}
@@ -374,8 +375,9 @@ func (d *Daemon) ensureWitnessRunning(rigName string) {
 	d.logger.Printf("Witness for %s not running per agent bead, starting...", rigName)
 
 	// Create session in witness directory
+	// Use EnsureSessionFresh to handle zombie sessions that exist but have dead Claude
 	witnessDir := filepath.Join(d.config.TownRoot, rigName, "witness")
-	if err := d.tmux.NewSession(sessionName, witnessDir); err != nil {
+	if err := d.tmux.EnsureSessionFresh(sessionName, witnessDir); err != nil {
 		d.logger.Printf("Error creating witness session for %s: %v", rigName, err)
 		return
 	}
@@ -664,7 +666,8 @@ func (d *Daemon) restartPolecatSession(rigName, polecatName, sessionName string)
 	d.syncWorkspace(workDir)
 
 	// Create new tmux session
-	if err := d.tmux.NewSession(sessionName, workDir); err != nil {
+	// Use EnsureSessionFresh to handle zombie sessions that exist but have dead Claude
+	if err := d.tmux.EnsureSessionFresh(sessionName, workDir); err != nil {
 		return fmt.Errorf("creating session: %w", err)
 	}
 
diff --git a/internal/daemon/lifecycle.go b/internal/daemon/lifecycle.go
index 7f5e7bc2..760ff2ad 100644
--- a/internal/daemon/lifecycle.go
+++ b/internal/daemon/lifecycle.go
@@ -348,7 +348,8 @@ func (d *Daemon) restartSession(sessionName, identity string) error {
 	}
 
 	// Create session
-	if err := d.tmux.NewSession(sessionName, workDir); err != nil {
+	// Use EnsureSessionFresh to handle zombie sessions that exist but have dead Claude
+	if err := d.tmux.EnsureSessionFresh(sessionName, workDir); err != nil {
 		return fmt.Errorf("creating session: %w", err)
 	}
 
diff --git a/internal/tmux/tmux.go b/internal/tmux/tmux.go
index 2b1a0071..c48055d1 100644
--- a/internal/tmux/tmux.go
+++ b/internal/tmux/tmux.go
@@ -76,6 +76,40 @@ func (t *Tmux) NewSession(name, workDir string) error {
 	return err
 }
 
+// EnsureSessionFresh ensures a session is available and healthy.
+// If the session exists but is a zombie (Claude not running), it kills the session first.
+// This prevents "session already exists" errors when trying to restart dead agents.
+//
+// A session is considered a zombie if:
+// - The tmux session exists
+// - But Claude (node process) is not running in it
+//
+// Returns nil if a healthy session already exists (it is left untouched) or if a
+// fresh session was created. HasSession and KillSession failures are returned wrapped;
+// a NewSession failure is returned as-is.
+func (t *Tmux) EnsureSessionFresh(name, workDir string) error {
+	exists, err := t.HasSession(name)
+	if err != nil {
+		return fmt.Errorf("checking session: %w", err)
+	}
+
+	if exists {
+		if t.IsClaudeRunning(name) {
+			// Session is healthy (Claude running) - nothing to do
+			return nil
+		}
+
+		// Zombie session: tmux alive but Claude dead.
+		// Kill it so we can create a fresh one.
+		if err := t.KillSession(name); err != nil {
+			return fmt.Errorf("killing zombie session: %w", err)
+		}
+	}
+
+	// Create fresh session
+	return t.NewSession(name, workDir)
+}
+
 // KillSession terminates a tmux session.
 func (t *Tmux) KillSession(name string) error {
 	_, err := t.run("kill-session", "-t", name)
diff --git a/internal/tmux/tmux_test.go b/internal/tmux/tmux_test.go
index 0116bf53..8aeee7f5 100644
--- a/internal/tmux/tmux_test.go
+++ b/internal/tmux/tmux_test.go
@@ -209,3 +209,98 @@ func TestWrapError(t *testing.T) {
 		}
 	}
 }
+
+func TestEnsureSessionFresh_NoExistingSession(t *testing.T) {
+	if !hasTmux() {
+		t.Skip("tmux not installed")
+	}
+
+	tm := NewTmux()
+	sessionName := "gt-test-fresh-" + t.Name()
+
+	// Clean up any existing session
+	_ = tm.KillSession(sessionName)
+
+	// EnsureSessionFresh should create a new session
+	if err := tm.EnsureSessionFresh(sessionName, ""); err != nil {
+		t.Fatalf("EnsureSessionFresh: %v", err)
+	}
+	defer func() { _ = tm.KillSession(sessionName) }()
+
+	// Verify session exists
+	has, err := tm.HasSession(sessionName)
+	if err != nil {
+		t.Fatalf("HasSession: %v", err)
+	}
+	if !has {
+		t.Error("expected session to exist after EnsureSessionFresh")
+	}
+}
+
+func TestEnsureSessionFresh_ZombieSession(t *testing.T) {
+	if !hasTmux() {
+		t.Skip("tmux not installed")
+	}
+
+	tm := NewTmux()
+	sessionName := "gt-test-zombie-" + t.Name()
+
+	// Clean up any existing session
+	_ = tm.KillSession(sessionName)
+
+	// Create a zombie session (session exists but no Claude/node running)
+	// A normal tmux session with bash/zsh is a "zombie" for our purposes
+	if err := tm.NewSession(sessionName, ""); err != nil {
+		t.Fatalf("NewSession: %v", err)
+	}
+	defer func() { _ = tm.KillSession(sessionName) }()
+
+	// Verify it's a zombie (not running Claude/node)
+	if tm.IsClaudeRunning(sessionName) {
+		t.Skip("session unexpectedly has Claude running - can't test zombie case")
+	}
+
+	// EnsureSessionFresh should kill the zombie and create fresh session
+	// This should NOT error with "session already exists"
+	if err := tm.EnsureSessionFresh(sessionName, ""); err != nil {
+		t.Fatalf("EnsureSessionFresh on zombie: %v", err)
+	}
+
+	// Session should still exist
+	has, err := tm.HasSession(sessionName)
+	if err != nil {
+		t.Fatalf("HasSession: %v", err)
+	}
+	if !has {
+		t.Error("expected session to exist after EnsureSessionFresh on zombie")
+	}
+}
+
+func TestEnsureSessionFresh_IdempotentOnZombie(t *testing.T) {
+	if !hasTmux() {
+		t.Skip("tmux not installed")
+	}
+
+	tm := NewTmux()
+	sessionName := "gt-test-idem-" + t.Name()
+
+	// Clean up any existing session; the deferred kill runs even if a later attempt fails
+	_ = tm.KillSession(sessionName)
+	defer func() { _ = tm.KillSession(sessionName) }()
+
+	// Call EnsureSessionFresh multiple times - should work each time
+	for i := 0; i < 3; i++ {
+		if err := tm.EnsureSessionFresh(sessionName, ""); err != nil {
+			t.Fatalf("EnsureSessionFresh attempt %d: %v", i+1, err)
+		}
+	}
+
+	// Session should exist
+	has, err := tm.HasSession(sessionName)
+	if err != nil {
+		t.Fatalf("HasSession: %v", err)
+	}
+	if !has {
+		t.Error("expected session to exist after multiple EnsureSessionFresh calls")
+	}
+}