From 85ec39c487a223f3cd9d610a1ad83e9444058a3c Mon Sep 17 00:00:00 2001
From: Steve Yegge
Date: Mon, 29 Dec 2025 22:07:45 -0800
Subject: [PATCH] fix: Add session health monitoring and auto-restart for crashed polecats (gt-i7wcn)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fix addresses the issue where polecat sessions terminate unexpectedly
during work without recovery.

Changes:
- Add `checkPolecatSessionHealth()` to daemon heartbeat loop
  - Proactively validates tmux sessions are alive for polecats
  - Detects crashed polecats that have work-on-hook
  - Auto-restarts crashed polecats with proper environment setup
  - Notifies Witness if restart fails as fallback

- Add polecat support to lifecycle identity mapping
  - `identityToSession()` now handles polecat identities
  - `restartSession()` can restart crashed polecat sessions
  - `identityToStateFile()` handles polecat state files
  - `identityToAgentBeadID()` handles polecat agent beads
  - `identityToBDActor()` handles polecat BD_ACTOR conversion

- Add `gt session check` command for manual health checking
  - Validates tmux sessions exist for all polecats
  - Shows summary of healthy vs not-running sessions
  - Useful for debugging session issues

This provides faster recovery (within heartbeat interval) compared to waiting
for GUPP violation timeout (30 min) or Witness detection.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5
---
 internal/cmd/session.go      | 108 +++++++++++++++++++++++++
 internal/daemon/daemon.go    | 152 +++++++++++++++++++++++++++++++++++
 internal/daemon/lifecycle.go |  85 ++++++++++++++++++++
 3 files changed, 345 insertions(+)

diff --git a/internal/cmd/session.go b/internal/cmd/session.go
index 9754e12f..29878c3f 100644
--- a/internal/cmd/session.go
+++ b/internal/cmd/session.go
@@ -147,6 +147,24 @@ Displays running state, uptime, session info, and activity.`,
 	RunE: runSessionStatus,
 }
 
+// sessionCheckCmd reports whether each polecat's tmux session is running.
+var sessionCheckCmd = &cobra.Command{
+	Use:   "check [rig]",
+	Short: "Check session health for polecats",
+	Long: `Check whether polecat tmux sessions are alive.
+
+For every polecat directory in the selected rigs, this verifies that the
+expected tmux session (gt-<rig>-<polecat>) exists and prints a summary.
+
+Use this for manual health checks or debugging session issues.
+
+Examples:
+  gt session check              # Check all rigs
+  gt session check gastown      # Check specific rig`,
+	Args: cobra.MaximumNArgs(1),
+	RunE: runSessionCheck,
+}
+
 func init() {
 	// Start flags
 	sessionStartCmd.Flags().StringVar(&sessionIssue, "issue", "", "Issue ID to work on")
@@ -177,6 +195,7 @@ func init() {
 	sessionCmd.AddCommand(sessionInjectCmd)
 	sessionCmd.AddCommand(sessionRestartCmd)
 	sessionCmd.AddCommand(sessionStatusCmd)
+	sessionCmd.AddCommand(sessionCheckCmd)
 
 	rootCmd.AddCommand(sessionCmd)
 }
@@ -573,3 +592,92 @@ func formatDuration(d time.Duration) string {
 	}
 	return fmt.Sprintf("%dh %dm", hours, mins)
 }
+
+// runSessionCheck implements "gt session check [rig]". For each polecat
+// directory under the selected rig(s) it reports whether the expected tmux
+// session (gt-<rig>-<polecat>) exists, then prints a summary. It is
+// read-only: hooks are not inspected and nothing is restarted.
+func runSessionCheck(cmd *cobra.Command, args []string) error {
+	townRoot, err := workspace.FindFromCwdOrError()
+	if err != nil {
+		return fmt.Errorf("not in a Gas Town workspace: %w", err)
+	}
+
+	// Best effort: if rigs.json cannot be loaded, fall back to an empty
+	// config so rig discovery can still run.
+	rigsConfigPath := filepath.Join(townRoot, "mayor", "rigs.json")
+	rigsConfig, err := config.LoadRigsConfig(rigsConfigPath)
+	if err != nil {
+		rigsConfig = &config.RigsConfig{Rigs: make(map[string]config.RigEntry)}
+	}
+
+	rigMgr := rig.NewManager(townRoot, rigsConfig, git.NewGit(townRoot))
+	rigs, err := rigMgr.DiscoverRigs()
+	if err != nil {
+		return fmt.Errorf("discovering rigs: %w", err)
+	}
+
+	// Narrow to the requested rig, if one was given.
+	if len(args) > 0 {
+		rigFilter := args[0]
+		var filtered []*rig.Rig
+		for _, r := range rigs {
+			if r.Name == rigFilter {
+				filtered = append(filtered, r)
+			}
+		}
+		if len(filtered) == 0 {
+			return fmt.Errorf("rig not found: %s", rigFilter)
+		}
+		rigs = filtered
+	}
+
+	fmt.Printf("%s Session Health Check\n\n", style.Bold.Render("🔍"))
+
+	t := tmux.NewTmux()
+	var totalChecked, totalHealthy, totalCrashed, totalErrors int
+
+	for _, r := range rigs {
+		polecatsDir := filepath.Join(r.Path, "polecats")
+		entries, err := os.ReadDir(polecatsDir)
+		if err != nil {
+			continue // Rig might not have polecats
+		}
+
+		for _, entry := range entries {
+			if !entry.IsDir() {
+				continue
+			}
+			polecatName := entry.Name()
+			sessionName := fmt.Sprintf("gt-%s-%s", r.Name, polecatName)
+			totalChecked++
+
+			running, err := t.HasSession(sessionName)
+			switch {
+			case err != nil:
+				// Show the cause and count it so the summary adds up.
+				fmt.Printf("  %s %s/%s: %s\n", style.Bold.Render("⚠"), r.Name, polecatName,
+					style.Dim.Render(fmt.Sprintf("error checking session: %v", err)))
+				totalErrors++
+			case running:
+				fmt.Printf("  %s %s/%s: %s\n", style.Bold.Render("✓"), r.Name, polecatName, style.Dim.Render("session alive"))
+				totalHealthy++
+			default:
+				fmt.Printf("  %s %s/%s: %s\n", style.Bold.Render("✗"), r.Name, polecatName, style.Dim.Render("session not running"))
+				totalCrashed++
+			}
+		}
+	}
+
+	fmt.Printf("\n%s Summary: %d checked, %d healthy, %d not running\n",
+		style.Bold.Render("📊"), totalChecked, totalHealthy, totalCrashed)
+	if totalErrors > 0 {
+		fmt.Printf("%s %d session(s) could not be checked\n", style.Bold.Render("⚠"), totalErrors)
+	}
+	if totalCrashed > 0 {
+		fmt.Printf("\n%s To restart crashed polecats: gt session restart <rig>/<polecat>\n",
+			style.Dim.Render("Tip:"))
+	}
+
+	return nil
+}
diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go
index 9d03f61c..46038def 100644
--- a/internal/daemon/daemon.go
+++ b/internal/daemon/daemon.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"log"
 	"os"
+	"os/exec"
 	"os/signal"
 	"path/filepath"
 	"strconv"
@@ -202,6 +203,10 @@ func (d *Daemon) heartbeat(state *State) {
 	// 7. Check for orphaned work (assigned to dead agents)
 	d.checkOrphanedWork()
 
+	// 8. Check polecat session health (proactive crash detection)
+	// This validates tmux sessions are still alive for polecats with work-on-hook
+	d.checkPolecatSessionHealth()
+
 	// Update state
 	state.LastHeartbeat = time.Now()
 	state.HeartbeatCount++
@@ -469,3 +474,150 @@ func StopDaemon(townRoot string) error {
 
 	return nil
 }
+
+// checkPolecatSessionHealth proactively validates polecat tmux sessions.
+// It runs once per heartbeat and, for every known rig, looks for polecats
+// that:
+//  1. Have work-on-hook (a non-empty hook_bead on their agent bead)
+//  2. But whose tmux session is no longer running
+//
+// When a crash is detected, the polecat is automatically restarted.
+// This provides faster recovery than waiting for GUPP timeout or Witness detection.
+func (d *Daemon) checkPolecatSessionHealth() {
+	for _, rigName := range d.getKnownRigs() {
+		d.checkRigPolecatHealth(rigName)
+	}
+}
+
+// checkRigPolecatHealth checks polecat session health for a specific rig.
+// Every subdirectory of <townRoot>/<rig>/polecats is treated as one polecat;
+// a missing or unreadable polecats directory is skipped silently.
+func (d *Daemon) checkRigPolecatHealth(rigName string) {
+	polecatsDir := filepath.Join(d.config.TownRoot, rigName, "polecats")
+	entries, err := os.ReadDir(polecatsDir)
+	if err != nil {
+		return // No polecats directory - rig might not have polecats
+	}
+
+	for _, entry := range entries {
+		if !entry.IsDir() {
+			continue
+		}
+		d.checkPolecatHealth(rigName, entry.Name())
+	}
+}
+
+// checkPolecatHealth checks a single polecat's session health.
+// If the polecat has work-on-hook but the tmux session is dead, it's restarted;
+// if the restart fails, the rig's witness is notified instead.
+func (d *Daemon) checkPolecatHealth(rigName, polecatName string) {
+	// Decision order: session alive -> healthy; session gone but nothing on
+	// the hook -> idle; session gone with hooked work -> crash, so restart.
+	sessionName := fmt.Sprintf("gt-%s-%s", rigName, polecatName)
+
+	alive, err := d.tmux.HasSession(sessionName)
+	if err != nil {
+		d.logger.Printf("Error checking session %s: %v", sessionName, err)
+		return
+	}
+
+	if alive {
+		return
+	}
+
+	// The session is gone. That only counts as a crash when the polecat's
+	// agent bead still has work on its hook; otherwise the polecat was idle
+	// and is left alone.
+	info, err := d.getAgentBeadInfo(beads.PolecatBeadID(rigName, polecatName))
+	if err != nil {
+		// Deliberately silent: a failed lookup is treated as "polecat not
+		// registered", so nothing is restarted or logged.
+		return
+	}
+
+	if info.HookBead == "" {
+		return // idle polecat - no work to resume
+	}
+
+	d.logger.Printf("CRASH DETECTED: polecat %s/%s has hook_bead=%s but session %s is dead",
+		rigName, polecatName, info.HookBead, sessionName)
+
+	// Bring the session back; if that fails, escalate to the rig's witness
+	// instead of dropping the failure.
+	restartErr := d.restartPolecatSession(rigName, polecatName, sessionName)
+	if restartErr != nil {
+		d.logger.Printf("Error restarting polecat %s/%s: %v", rigName, polecatName, restartErr)
+		d.notifyWitnessOfCrashedPolecat(rigName, polecatName, info.HookBead, restartErr)
+		return
+	}
+
+	d.logger.Printf("Successfully restarted crashed polecat %s/%s", rigName, polecatName)
+}
+
+// restartPolecatSession restarts a crashed polecat session.
+func (d *Daemon) restartPolecatSession(rigName, polecatName, sessionName string) error {
+	workDir := filepath.Join(d.config.TownRoot, rigName, "polecats", polecatName)
+
+	// Verify the worktree is a usable directory. Any Stat failure, not only
+	// "does not exist", aborts the restart before a session is created.
+	switch fi, err := os.Stat(workDir); {
+	case os.IsNotExist(err):
+		return fmt.Errorf("polecat worktree does not exist: %s", workDir)
+	case err != nil:
+		return fmt.Errorf("checking polecat worktree %s: %w", workDir, err)
+	case !fi.IsDir():
+		return fmt.Errorf("polecat worktree is not a directory: %s", workDir)
+	}
+
+	// Pre-sync workspace (ensure beads are current)
+	d.syncWorkspace(workDir)
+
+	if err := d.tmux.NewSession(sessionName, workDir); err != nil {
+		return fmt.Errorf("creating session: %w", err)
+	}
+
+	// Session environment, theme and pane-died hook are best-effort, so
+	// their errors are deliberately ignored. The startup command below
+	// exports GT_ROLE, GT_RIG, GT_POLECAT and BD_ACTOR itself.
+	agentID := fmt.Sprintf("%s/%s", rigName, polecatName)
+	bdActor := fmt.Sprintf("%s/polecats/%s", rigName, polecatName)
+	beadsDir := filepath.Join(d.config.TownRoot, rigName, ".beads")
+	_ = d.tmux.SetEnvironment(sessionName, "GT_ROLE", "polecat")
+	_ = d.tmux.SetEnvironment(sessionName, "GT_RIG", rigName)
+	_ = d.tmux.SetEnvironment(sessionName, "GT_POLECAT", polecatName)
+	_ = d.tmux.SetEnvironment(sessionName, "BD_ACTOR", bdActor)
+	_ = d.tmux.SetEnvironment(sessionName, "BEADS_DIR", beadsDir)
+	_ = d.tmux.SetEnvironment(sessionName, "BEADS_NO_DAEMON", "1")
+	_ = d.tmux.SetEnvironment(sessionName, "BEADS_AGENT_NAME", agentID)
+	_ = d.tmux.ConfigureGasTownSession(sessionName, tmux.AssignTheme(rigName), rigName, polecatName, "polecat")
+	_ = d.tmux.SetPaneDiedHook(sessionName, agentID) // for future crash detection
+
+	// Launch Claude with environment exported inline.
+	// NOTE(review): if SendKeys fails the new session stays alive without
+	// Claude, so later health checks will see it as healthy - confirm intent.
+	startCmd := fmt.Sprintf("export GT_ROLE=polecat GT_RIG=%s GT_POLECAT=%s BD_ACTOR=%s && claude --dangerously-skip-permissions",
+		rigName, polecatName, bdActor)
+	if err := d.tmux.SendKeys(sessionName, startCmd); err != nil {
+		return fmt.Errorf("sending startup command: %w", err)
+	}
+	return nil
+}
+
+// notifyWitnessOfCrashedPolecat notifies the witness when a polecat restart fails.
+func (d *Daemon) notifyWitnessOfCrashedPolecat(rigName, polecatName, hookBead string, restartErr error) {
+	witnessAddr := rigName + "/witness" // addressed to this rig's witness
+	subject := fmt.Sprintf("CRASHED_POLECAT: %s/%s restart failed", rigName, polecatName)
+	body := fmt.Sprintf(`Polecat %s crashed and automatic restart failed.
+
+hook_bead: %s
+restart_error: %v
+
+Manual intervention may be required.`,
+		polecatName, hookBead, restartErr)
+
+	cmd := exec.Command("gt", "mail", "send", witnessAddr, "-s", subject, "-m", body) // sent by shelling out to the gt CLI
+	cmd.Dir = d.config.TownRoot // run gt from the town root
+	if err := cmd.Run(); err != nil { // synchronous; no timeout is set
+		d.logger.Printf("Warning: failed to notify witness of crashed polecat: %v", err) // best effort: a failed send is only logged
+	}
+}
diff --git a/internal/daemon/lifecycle.go b/internal/daemon/lifecycle.go
index d7d2603b..758dc268 100644
--- a/internal/daemon/lifecycle.go
+++ b/internal/daemon/lifecycle.go
@@ -235,6 +235,21 @@ func (d *Daemon) identityToSession(identity string) string {
 	if strings.Contains(identity, "-crew-") {
 		return "gt-" + identity
 	}
+	// Pattern: <rig>-polecat-<name> or <rig>/polecats/<name> → gt-<rig>-<name>
+	if strings.Contains(identity, "-polecat-") {
+		// <rig>-polecat-<name> → gt-<rig>-<name>
+		parts := strings.SplitN(identity, "-polecat-", 2)
+		if len(parts) == 2 {
+			return fmt.Sprintf("gt-%s-%s", parts[0], parts[1])
+		}
+	}
+	if strings.Contains(identity, "/polecats/") {
+		// <rig>/polecats/<name> → gt-<rig>-<name>
+		parts := strings.Split(identity, "/polecats/")
+		if len(parts) == 2 {
+			return fmt.Sprintf("gt-%s-%s", parts[0], parts[1])
+		}
+	}
 	// Unknown identity
 	return ""
 }
@@ -277,6 +292,31 @@ func (d *Daemon) restartSession(sessionName, identity string) error {
 		startCmd = "exec claude --dangerously-skip-permissions"
 		agentRole = "crew"
 		needsPreSync = true
+	} else if strings.Contains(identity, "-polecat-") || strings.Contains(identity, "/polecats/") {
+		// Extract rig and polecat name from either format:
+		// <rig>-polecat-<name> or <rig>/polecats/<name>
+		var polecatName string
+		if strings.Contains(identity, "-polecat-") {
+			parts := strings.SplitN(identity, "-polecat-", 2)
+			if len(parts) != 2 {
+				return fmt.Errorf("invalid polecat identity format: %s", identity)
+			}
+			rigName = parts[0]
+			polecatName = parts[1]
+		} else {
+			parts := strings.Split(identity, "/polecats/")
+			if len(parts) != 2 {
+				return fmt.Errorf("invalid polecat identity format: %s", identity)
+			}
+			rigName = parts[0]
+			polecatName = parts[1]
+		}
+		workDir = filepath.Join(d.config.TownRoot, rigName, "polecats", polecatName)
+		bdActor := fmt.Sprintf("%s/polecats/%s", rigName, polecatName)
+		startCmd = fmt.Sprintf("export GT_ROLE=polecat GT_RIG=%s GT_POLECAT=%s BD_ACTOR=%s && claude --dangerously-skip-permissions",
+			rigName, polecatName, bdActor)
+		agentRole = "polecat"
+		needsPreSync = true
 	} else {
 		return fmt.Errorf("don't know how to restart %s", identity)
 	}
@@ -464,6 +504,24 @@ func (d *Daemon) identityToStateFile(identity string) string {
 			return filepath.Join(d.config.TownRoot, rigName, "crew", crewName, "state.json")
 		}
 	}
+	// Pattern: <rig>-polecat-<name> → <townRoot>/<rig>/polecats/<name>/state.json
+	if strings.Contains(identity, "-polecat-") {
+		parts := strings.SplitN(identity, "-polecat-", 2)
+		if len(parts) == 2 {
+			rigName := parts[0]
+			polecatName := parts[1]
+			return filepath.Join(d.config.TownRoot, rigName, "polecats", polecatName, "state.json")
+		}
+	}
+	// Pattern: <rig>/polecats/<name> → <townRoot>/<rig>/polecats/<name>/state.json
+	if strings.Contains(identity, "/polecats/") {
+		parts := strings.Split(identity, "/polecats/")
+		if len(parts) == 2 {
+			rigName := parts[0]
+			polecatName := parts[1]
+			return filepath.Join(d.config.TownRoot, rigName, "polecats", polecatName, "state.json")
+		}
+	}
 	// Unknown identity - can't determine state file
 	return ""
 }
@@ -550,6 +608,7 @@ func (d *Daemon) getAgentBeadInfo(agentBeadID string) (*AgentBeadInfo, error) {
 //   - "mayor" → "gt-mayor"
 //   - "gastown-witness" → "gt-gastown-witness"
 //   - "gastown-refinery" → "gt-gastown-refinery"
+//   - "gastown-polecat-toast" → "gt-polecat-gastown-toast"
 func (d *Daemon) identityToAgentBeadID(identity string) string {
 	switch identity {
 	case "deacon":
@@ -574,6 +633,20 @@ func (d *Daemon) identityToAgentBeadID(identity string) string {
 			return beads.CrewBeadID(parts[0], parts[1])
 		}
 	}
+	// Pattern: <rig>-polecat-<name> → gt-polecat-<rig>-<name>
+	if strings.Contains(identity, "-polecat-") {
+		parts := strings.SplitN(identity, "-polecat-", 2)
+		if len(parts) == 2 {
+			return beads.PolecatBeadID(parts[0], parts[1])
+		}
+	}
+	// Pattern: <rig>/polecats/<name> → gt-polecat-<rig>-<name>
+	if strings.Contains(identity, "/polecats/") {
+		parts := strings.Split(identity, "/polecats/")
+		if len(parts) == 2 {
+			return beads.PolecatBeadID(parts[0], parts[1])
+		}
+	}
 	// Unknown format
 	return ""
 }
@@ -673,6 +746,7 @@ func (d *Daemon) markAgentDead(agentBeadID string) error {
 //   - "gastown-witness" → "gastown/witness"
 //   - "gastown-refinery" → "gastown/refinery"
 //   - "gastown-crew-max" → "gastown/crew/max"
+//   - "gastown-polecat-toast" → "gastown/polecats/toast"
 func identityToBDActor(identity string) string {
 	switch identity {
 	case "mayor", "deacon":
@@ -695,6 +769,17 @@ func identityToBDActor(identity string) string {
 			return parts[0] + "/crew/" + parts[1]
 		}
 	}
+	// Pattern: <rig>-polecat-<name> → <rig>/polecats/<name>
+	if strings.Contains(identity, "-polecat-") {
+		parts := strings.SplitN(identity, "-polecat-", 2)
+		if len(parts) == 2 {
+			return parts[0] + "/polecats/" + parts[1]
+		}
+	}
+	// Identity already in slash format - return as-is
+	if strings.Contains(identity, "/polecats/") {
+		return identity
+	}
 	// Unknown format - return as-is
 	return identity
 }