diff --git a/internal/cmd/doctor.go b/internal/cmd/doctor.go
index a28c3fcd..bc269ff4 100644
--- a/internal/cmd/doctor.go
+++ b/internal/cmd/doctor.go
@@ -135,6 +135,7 @@ func runDoctor(cmd *cobra.Command, args []string) error {
 	d.Register(doctor.NewRoutesCheck())
 	d.Register(doctor.NewRigRoutesJSONLCheck())
 	d.Register(doctor.NewOrphanSessionCheck())
+	d.Register(doctor.NewZombieSessionCheck())
 	d.Register(doctor.NewOrphanProcessCheck())
 	d.Register(doctor.NewWispGCCheck())
 	d.Register(doctor.NewBranchCheck())
diff --git a/internal/doctor/zombie_check.go b/internal/doctor/zombie_check.go
new file mode 100644
index 00000000..e1444d73
--- /dev/null
+++ b/internal/doctor/zombie_check.go
@@ -0,0 +1,137 @@
+package doctor
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/steveyegge/gastown/internal/events"
+	"github.com/steveyegge/gastown/internal/tmux"
+)
+
+// ZombieSessionCheck reports Gas Town tmux sessions that are still alive
+// although no Claude/node process is running inside them (zombies), which
+// happens when Claude exits or crashes and leaves its tmux session behind.
+type ZombieSessionCheck struct {
+	FixableCheck
+
+	// zombieSessions is the list found by the latest Run, kept for Fix.
+	zombieSessions []string
+}
+
+// NewZombieSessionCheck builds the check with its name ("zombie-sessions"),
+// description and category (CategoryCleanup) filled in.
+func NewZombieSessionCheck() *ZombieSessionCheck {
+	base := BaseCheck{
+		CheckName:        "zombie-sessions",
+		CheckDescription: "Detect tmux sessions with dead Claude processes",
+		CheckCategory:    CategoryCleanup,
+	}
+	return &ZombieSessionCheck{FixableCheck: FixableCheck{BaseCheck: base}}
+}
+
+// Run checks for zombie Gas Town sessions (tmux alive but Claude dead).
+func (c *ZombieSessionCheck) Run(ctx *CheckContext) *CheckResult {
+	// Drop any list cached by an earlier Run so that an early return below
+	// cannot leave stale session names behind for Fix to act on.
+	c.zombieSessions = nil
+
+	t := tmux.NewTmux()
+	sessions, err := t.ListSessions()
+	if err != nil {
+		return &CheckResult{
+			Name:    c.Name(),
+			Status:  StatusWarning,
+			Message: "Could not list tmux sessions",
+			Details: []string{err.Error()},
+		}
+	}
+
+	if len(sessions) == 0 {
+		return &CheckResult{
+			Name:    c.Name(),
+			Status:  StatusOK,
+			Message: "No tmux sessions found",
+		}
+	}
+
+	// Check each Gas Town session for zombie status
+	var zombies []string
+	var healthyCount int
+
+	for _, sess := range sessions {
+		// Only check Gas Town sessions (gt-* and hq-*). An empty name has
+		// neither prefix, so it is skipped here without a separate test.
+		if !strings.HasPrefix(sess, "gt-") && !strings.HasPrefix(sess, "hq-") {
+			continue
+		}
+
+		// Skip crew sessions - they are human-managed and may intentionally
+		// have no Claude running (e.g., between work assignments)
+		if isCrewSession(sess) {
+			continue
+		}
+
+		// Check if Claude is running in this session
+		if t.IsClaudeRunning(sess) {
+			healthyCount++
+		} else {
+			zombies = append(zombies, sess)
+		}
+	}
+
+	// Cache zombies for Fix
+	c.zombieSessions = zombies
+
+	if len(zombies) == 0 {
+		msg := "No zombie sessions found"
+		if healthyCount > 0 {
+			msg = fmt.Sprintf("All %d Gas Town sessions have running Claude processes", healthyCount)
+		}
+		return &CheckResult{
+			Name:    c.Name(),
+			Status:  StatusOK,
+			Message: msg,
+		}
+	}
+
+	details := make([]string, len(zombies))
+	for i, session := range zombies {
+		details[i] = fmt.Sprintf("Zombie: %s (tmux alive, Claude dead)", session)
+	}
+
+	return &CheckResult{
+		Name:    c.Name(),
+		Status:  StatusWarning,
+		Message: fmt.Sprintf("Found %d zombie session(s)", len(zombies)),
+		Details: details,
+		FixHint: "Run 'gt doctor --fix' to kill zombie sessions",
+	}
+}
+
+// Fix kills all zombie sessions (tmux sessions with no Claude running).
+// Crew sessions are never auto-killed as they are human-managed.
+func (c *ZombieSessionCheck) Fix(ctx *CheckContext) error {
+	t := tmux.NewTmux()
+	var firstErr error
+	failed := 0
+	for _, sess := range c.zombieSessions {
+		// Never kill crew sessions (human-managed), and re-check liveness:
+		// Claude may have been restarted in the session since Run cached it.
+		if isCrewSession(sess) || t.IsClaudeRunning(sess) {
+			continue
+		}
+		// Best-effort audit record; a logging failure must not block cleanup.
+		_ = events.LogFeed(events.TypeSessionDeath, sess,
+			events.SessionDeathPayload(sess, "unknown", "zombie cleanup", "gt doctor"))
+		if err := t.KillSession(sess); err != nil {
+			failed++
+			if firstErr == nil {
+				firstErr = fmt.Errorf("killing zombie session %q: %w", sess, err)
+			}
+		}
+	}
+	if failed > 1 {
+		return fmt.Errorf("%w (and %d more kill failure(s))", firstErr, failed-1)
+	}
+	return firstErr
+}
diff --git a/internal/doctor/zombie_check_test.go b/internal/doctor/zombie_check_test.go
new file mode 100644
index 00000000..03a88e74
--- /dev/null
+++ b/internal/doctor/zombie_check_test.go
@@ -0,0 +1,80 @@
+package doctor
+
+import (
+	"testing"
+)
+
+// TestNewZombieSessionCheck verifies the metadata set by the constructor.
+func TestNewZombieSessionCheck(t *testing.T) {
+	check := NewZombieSessionCheck()
+
+	if check.Name() != "zombie-sessions" {
+		t.Errorf("expected name 'zombie-sessions', got %q", check.Name())
+	}
+
+	if check.Description() != "Detect tmux sessions with dead Claude processes" {
+		t.Errorf("expected description 'Detect tmux sessions with dead Claude processes', got %q", check.Description())
+	}
+
+	if !check.CanFix() {
+		t.Error("expected CanFix to return true")
+	}
+
+	if check.Category() != CategoryCleanup {
+		t.Errorf("expected category %q, got %q", CategoryCleanup, check.Category())
+	}
+}
+
+// TestZombieSessionCheck_Run_NoSessions is a smoke test: Run lists sessions
+// through tmux.NewTmux(), so the result depends on the test environment.
+func TestZombieSessionCheck_Run_NoSessions(t *testing.T) {
+	check := NewZombieSessionCheck()
+	ctx := &CheckContext{TownRoot: t.TempDir()}
+
+	result := check.Run(ctx)
+	if result == nil {
+		t.Fatal("Run returned nil result")
+	}
+
+	// Should return OK or Warning depending on environment
+	if result.Status != StatusOK && result.Status != StatusWarning {
+		t.Errorf("expected StatusOK or StatusWarning, got %v: %s", result.Status, result.Message)
+	}
+}
+
+// TestZombieSessionCheck_SkipsCrewSessions verifies that Run never caches a
+// crew session as a zombie.
+func TestZombieSessionCheck_SkipsCrewSessions(t *testing.T) {
+	check := NewZombieSessionCheck()
+	ctx := &CheckContext{TownRoot: t.TempDir()}
+	check.Run(ctx)
+
+	// Inspect the raw session names cached for Fix. result.Details holds
+	// formatted text ("Zombie: <name> (...)"), not bare session names, so
+	// those strings are not valid input for isCrewSession.
+	for _, sess := range check.zombieSessions {
+		if isCrewSession(sess) {
+			t.Errorf("crew session should not be in zombie list: %s", sess)
+		}
+	}
+}
+
+// TestZombieSessionCheck_FixProtectsCrewSessions verifies that Fix skips crew
+// sessions even when one ends up in the cached zombie list.
+func TestZombieSessionCheck_FixProtectsCrewSessions(t *testing.T) {
+	const crewSession = "gt-gastown-crew-joe"
+	if !isCrewSession(crewSession) {
+		t.Fatalf("test fixture %q is not recognized as a crew session", crewSession)
+	}
+
+	// Seed only crew sessions (simulating a bug in Run). A non-crew name is
+	// deliberately left out: Fix is not mocked and calls KillSession, which
+	// could kill a live session of that name on the machine running the test.
+	check := NewZombieSessionCheck()
+	check.zombieSessions = []string{crewSession}
+
+	ctx := &CheckContext{TownRoot: t.TempDir()}
+	if err := check.Fix(ctx); err != nil {
+		t.Errorf("expected Fix to skip crew sessions without error, got %v", err)
+	}
+}