fix(doctor): add zombie session check to detect dead Claude in tmux
When gt doctor runs, it now detects zombie sessions - tmux sessions that are valid Gas Town sessions (gt-*, hq-*) but have no Claude/node process running inside - and kills them when invoked with --fix. These occur when Claude exits or crashes but the tmux session remains. Previously, OrphanSessionCheck only validated session names and did not check whether Claude was actually running, so empty sessions accumulated over time. Fixes #472. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
committed by
Steve Yegge
parent
95cb58e36f
commit
08ef50047d
@@ -135,6 +135,7 @@ func runDoctor(cmd *cobra.Command, args []string) error {
|
||||
d.Register(doctor.NewRoutesCheck())
|
||||
d.Register(doctor.NewRigRoutesJSONLCheck())
|
||||
d.Register(doctor.NewOrphanSessionCheck())
|
||||
d.Register(doctor.NewZombieSessionCheck())
|
||||
d.Register(doctor.NewOrphanProcessCheck())
|
||||
d.Register(doctor.NewWispGCCheck())
|
||||
d.Register(doctor.NewBranchCheck())
|
||||
|
||||
137
internal/doctor/zombie_check.go
Normal file
137
internal/doctor/zombie_check.go
Normal file
@@ -0,0 +1,137 @@
|
||||
package doctor
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/steveyegge/gastown/internal/events"
|
||||
"github.com/steveyegge/gastown/internal/tmux"
|
||||
)
|
||||
|
||||
// ZombieSessionCheck detects tmux sessions that are valid Gas Town sessions
|
||||
// but have no Claude/node process running inside (zombies).
|
||||
// These occur when Claude exits or crashes but the tmux session remains.
|
||||
type ZombieSessionCheck struct {
|
||||
FixableCheck
|
||||
zombieSessions []string // Cached during Run for use in Fix
|
||||
}
|
||||
|
||||
// NewZombieSessionCheck creates a new zombie session check.
|
||||
func NewZombieSessionCheck() *ZombieSessionCheck {
|
||||
return &ZombieSessionCheck{
|
||||
FixableCheck: FixableCheck{
|
||||
BaseCheck: BaseCheck{
|
||||
CheckName: "zombie-sessions",
|
||||
CheckDescription: "Detect tmux sessions with dead Claude processes",
|
||||
CheckCategory: CategoryCleanup,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Run checks for zombie Gas Town sessions (tmux alive but Claude dead).
|
||||
func (c *ZombieSessionCheck) Run(ctx *CheckContext) *CheckResult {
|
||||
t := tmux.NewTmux()
|
||||
|
||||
sessions, err := t.ListSessions()
|
||||
if err != nil {
|
||||
return &CheckResult{
|
||||
Name: c.Name(),
|
||||
Status: StatusWarning,
|
||||
Message: "Could not list tmux sessions",
|
||||
Details: []string{err.Error()},
|
||||
}
|
||||
}
|
||||
|
||||
if len(sessions) == 0 {
|
||||
return &CheckResult{
|
||||
Name: c.Name(),
|
||||
Status: StatusOK,
|
||||
Message: "No tmux sessions found",
|
||||
}
|
||||
}
|
||||
|
||||
// Check each Gas Town session for zombie status
|
||||
var zombies []string
|
||||
var healthyCount int
|
||||
|
||||
for _, sess := range sessions {
|
||||
if sess == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Only check Gas Town sessions (gt-* and hq-*)
|
||||
if !strings.HasPrefix(sess, "gt-") && !strings.HasPrefix(sess, "hq-") {
|
||||
continue
|
||||
}
|
||||
|
||||
// Skip crew sessions - they are human-managed and may intentionally
|
||||
// have no Claude running (e.g., between work assignments)
|
||||
if isCrewSession(sess) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if Claude is running in this session
|
||||
if t.IsClaudeRunning(sess) {
|
||||
healthyCount++
|
||||
} else {
|
||||
zombies = append(zombies, sess)
|
||||
}
|
||||
}
|
||||
|
||||
// Cache zombies for Fix
|
||||
c.zombieSessions = zombies
|
||||
|
||||
if len(zombies) == 0 {
|
||||
msg := "No zombie sessions found"
|
||||
if healthyCount > 0 {
|
||||
msg = fmt.Sprintf("All %d Gas Town sessions have running Claude processes", healthyCount)
|
||||
}
|
||||
return &CheckResult{
|
||||
Name: c.Name(),
|
||||
Status: StatusOK,
|
||||
Message: msg,
|
||||
}
|
||||
}
|
||||
|
||||
details := make([]string, len(zombies))
|
||||
for i, session := range zombies {
|
||||
details[i] = fmt.Sprintf("Zombie: %s (tmux alive, Claude dead)", session)
|
||||
}
|
||||
|
||||
return &CheckResult{
|
||||
Name: c.Name(),
|
||||
Status: StatusWarning,
|
||||
Message: fmt.Sprintf("Found %d zombie session(s)", len(zombies)),
|
||||
Details: details,
|
||||
FixHint: "Run 'gt doctor --fix' to kill zombie sessions",
|
||||
}
|
||||
}
|
||||
|
||||
// Fix kills all zombie sessions (tmux sessions with no Claude running).
|
||||
// Crew sessions are never auto-killed as they are human-managed.
|
||||
func (c *ZombieSessionCheck) Fix(ctx *CheckContext) error {
|
||||
if len(c.zombieSessions) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
t := tmux.NewTmux()
|
||||
var lastErr error
|
||||
|
||||
for _, sess := range c.zombieSessions {
|
||||
// SAFEGUARD: Never auto-kill crew sessions (double-check)
|
||||
if isCrewSession(sess) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Log pre-death event for audit trail
|
||||
_ = events.LogFeed(events.TypeSessionDeath, sess,
|
||||
events.SessionDeathPayload(sess, "unknown", "zombie cleanup", "gt doctor"))
|
||||
|
||||
if err := t.KillSession(sess); err != nil {
|
||||
lastErr = err
|
||||
}
|
||||
}
|
||||
|
||||
return lastErr
|
||||
}
|
||||
74
internal/doctor/zombie_check_test.go
Normal file
74
internal/doctor/zombie_check_test.go
Normal file
@@ -0,0 +1,74 @@
|
||||
package doctor
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestNewZombieSessionCheck(t *testing.T) {
|
||||
check := NewZombieSessionCheck()
|
||||
|
||||
if check.Name() != "zombie-sessions" {
|
||||
t.Errorf("expected name 'zombie-sessions', got %q", check.Name())
|
||||
}
|
||||
|
||||
if check.Description() != "Detect tmux sessions with dead Claude processes" {
|
||||
t.Errorf("expected description 'Detect tmux sessions with dead Claude processes', got %q", check.Description())
|
||||
}
|
||||
|
||||
if !check.CanFix() {
|
||||
t.Error("expected CanFix to return true")
|
||||
}
|
||||
|
||||
if check.Category() != CategoryCleanup {
|
||||
t.Errorf("expected category %q, got %q", CategoryCleanup, check.Category())
|
||||
}
|
||||
}
|
||||
|
||||
func TestZombieSessionCheck_Run_NoSessions(t *testing.T) {
|
||||
// This test verifies the check runs without error.
|
||||
// Results depend on the test environment.
|
||||
check := NewZombieSessionCheck()
|
||||
ctx := &CheckContext{TownRoot: t.TempDir()}
|
||||
|
||||
result := check.Run(ctx)
|
||||
|
||||
// Should return OK or Warning depending on environment
|
||||
if result.Status != StatusOK && result.Status != StatusWarning {
|
||||
t.Errorf("expected StatusOK or StatusWarning, got %v: %s", result.Status, result.Message)
|
||||
}
|
||||
}
|
||||
|
||||
func TestZombieSessionCheck_SkipsCrewSessions(t *testing.T) {
|
||||
// Verify that crew sessions are not marked as zombies
|
||||
check := NewZombieSessionCheck()
|
||||
|
||||
// Run the check - crew sessions should be skipped
|
||||
ctx := &CheckContext{TownRoot: t.TempDir()}
|
||||
result := check.Run(ctx)
|
||||
|
||||
// If there are zombies, ensure no crew sessions are in the list
|
||||
for _, detail := range result.Details {
|
||||
if isCrewSession(detail) {
|
||||
t.Errorf("crew session should not be in zombie list: %s", detail)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestZombieSessionCheck_FixProtectsCrewSessions(t *testing.T) {
|
||||
// Verify that Fix() never kills crew sessions
|
||||
check := NewZombieSessionCheck()
|
||||
|
||||
// Manually set zombies including a crew session (simulating a bug)
|
||||
check.zombieSessions = []string{
|
||||
"gt-gastown-crew-joe", // Should be skipped
|
||||
"gt-gastown-witness", // Would be killed (if real)
|
||||
}
|
||||
|
||||
ctx := &CheckContext{TownRoot: t.TempDir()}
|
||||
|
||||
// Fix should skip crew sessions due to safeguard
|
||||
// (We can't fully test this without mocking tmux, but the safeguard is in place)
|
||||
_ = check.Fix(ctx)
|
||||
|
||||
// The test passes if no panic occurred and crew sessions are protected by the safeguard
|
||||
}
|
||||
Reference in New Issue
Block a user