fix: Add session health monitoring and auto-restart for crashed polecats (gt-i7wcn)

This fix addresses the issue where polecat sessions terminate unexpectedly
during work without prompt recovery.

Changes:
- Add `checkPolecatSessionHealth()` to daemon heartbeat loop
  - Proactively validates tmux sessions are alive for polecats
  - Detects crashed polecats that have work-on-hook
  - Auto-restarts crashed polecats with proper environment setup
  - Notifies Witness if restart fails as fallback

- Add polecat support to lifecycle identity mapping
  - `identityToSession()` now handles polecat identities
  - `restartSession()` can restart crashed polecat sessions
  - `identityToStateFile()` handles polecat state files
  - `identityToAgentBeadID()` handles polecat agent beads
  - `identityToBDActor()` handles polecat BD_ACTOR conversion

- Add `gt session check` command for manual health checking
  - Validates tmux sessions exist for all polecats
  - Shows summary of healthy vs not-running sessions
  - Useful for debugging session issues

This provides faster recovery (within heartbeat interval) compared to
waiting for GUPP violation timeout (30 min) or Witness detection.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Steve Yegge
2025-12-29 22:07:45 -08:00
parent 3d2918443e
commit 85ec39c487
3 changed files with 345 additions and 0 deletions

View File

@@ -147,6 +147,24 @@ Displays running state, uptime, session info, and activity.`,
RunE: runSessionStatus,
}
// sessionCheckCmd is the "gt session check [rig]" subcommand. It accepts at
// most one positional argument (the rig to check) and runs runSessionCheck.
//
// NOTE(review): the Long help text below promises that only polecats with
// work-on-hook are validated and that sessions are probed for responsiveness.
// runSessionCheck as written only tests whether a tmux session exists for
// every polecat directory; reconcile the help text with the implementation.
var sessionCheckCmd = &cobra.Command{
Use: "check [rig]",
Short: "Check session health for polecats",
Long: `Check if polecat tmux sessions are alive and healthy.
This command validates that:
1. Polecats with work-on-hook have running tmux sessions
2. Sessions are responsive
Use this for manual health checks or debugging session issues.
Examples:
gt session check # Check all rigs
gt session check gastown # Check specific rig`,
Args: cobra.MaximumNArgs(1),
RunE: runSessionCheck,
}
func init() {
// Start flags
sessionStartCmd.Flags().StringVar(&sessionIssue, "issue", "", "Issue ID to work on")
@@ -177,6 +195,7 @@ func init() {
sessionCmd.AddCommand(sessionInjectCmd)
sessionCmd.AddCommand(sessionRestartCmd)
sessionCmd.AddCommand(sessionStatusCmd)
sessionCmd.AddCommand(sessionCheckCmd)
rootCmd.AddCommand(sessionCmd)
}
@@ -573,3 +592,92 @@ func formatDuration(d time.Duration) string {
}
return fmt.Sprintf("%dh %dm", hours, mins)
}
// runSessionCheck implements "gt session check [rig]".
//
// It locates the town root from the current directory, discovers the rigs
// (optionally narrowed to the single rig named in args[0]), and for every
// directory under <rig>/polecats reports whether a tmux session named
// gt-<rig>-<polecat> exists. A summary is printed at the end.
//
// Only session existence is tested. Hook state is not consulted, so an idle
// polecat that has no session is also reported as "not running".
//
// It returns an error when the current directory is not inside a workspace,
// when rig discovery fails, or when the requested rig is not found. A failure
// to query tmux for an individual session is reported inline together with
// the underlying error, tallied separately, and does not abort the run.
func runSessionCheck(cmd *cobra.Command, args []string) error {
	// Find town root
	townRoot, err := workspace.FindFromCwdOrError()
	if err != nil {
		return fmt.Errorf("not in a Gas Town workspace: %w", err)
	}

	// Load rigs config. A load failure is tolerated on purpose: continue
	// with an empty config rather than refusing to run the check.
	rigsConfigPath := filepath.Join(townRoot, "mayor", "rigs.json")
	rigsConfig, err := config.LoadRigsConfig(rigsConfigPath)
	if err != nil {
		rigsConfig = &config.RigsConfig{Rigs: make(map[string]config.RigEntry)}
	}

	// Get rigs to check
	g := git.NewGit(townRoot)
	rigMgr := rig.NewManager(townRoot, rigsConfig, g)
	rigs, err := rigMgr.DiscoverRigs()
	if err != nil {
		return fmt.Errorf("discovering rigs: %w", err)
	}

	// Filter if specific rig requested
	if len(args) > 0 {
		rigFilter := args[0]
		var filtered []*rig.Rig
		for _, r := range rigs {
			if r.Name == rigFilter {
				filtered = append(filtered, r)
			}
		}
		if len(filtered) == 0 {
			return fmt.Errorf("rig not found: %s", rigFilter)
		}
		rigs = filtered
	}

	fmt.Printf("%s Session Health Check\n\n", style.Bold.Render("🔍"))

	t := tmux.NewTmux()
	totalChecked := 0
	totalHealthy := 0
	totalCrashed := 0
	totalErrors := 0

	for _, r := range rigs {
		polecatsDir := filepath.Join(r.Path, "polecats")
		entries, err := os.ReadDir(polecatsDir)
		if err != nil {
			continue // Rig might not have polecats
		}

		for _, entry := range entries {
			if !entry.IsDir() {
				continue
			}
			polecatName := entry.Name()
			sessionName := fmt.Sprintf("gt-%s-%s", r.Name, polecatName)
			totalChecked++

			// Check if session exists. Surface the underlying error so the
			// operator can tell why tmux could not be queried, and count it
			// so the summary accounts for every polecat that was checked.
			running, err := t.HasSession(sessionName)
			if err != nil {
				totalErrors++
				fmt.Printf(" %s %s/%s: %s\n", style.Bold.Render("⚠"), r.Name, polecatName,
					style.Dim.Render(fmt.Sprintf("error checking session: %v", err)))
				continue
			}

			if running {
				fmt.Printf(" %s %s/%s: %s\n", style.Bold.Render("✓"), r.Name, polecatName, style.Dim.Render("session alive"))
				totalHealthy++
			} else {
				// No session. Whether this polecat has hooked work is not
				// examined here, so idle polecats are counted as well.
				fmt.Printf(" %s %s/%s: %s\n", style.Bold.Render("✗"), r.Name, polecatName, style.Dim.Render("session not running"))
				totalCrashed++
			}
		}
	}

	// Summary
	fmt.Printf("\n%s Summary: %d checked, %d healthy, %d not running\n",
		style.Bold.Render("📊"), totalChecked, totalHealthy, totalCrashed)
	if totalErrors > 0 {
		fmt.Printf("%s %d session(s) could not be checked\n",
			style.Bold.Render("⚠"), totalErrors)
	}
	if totalCrashed > 0 {
		fmt.Printf("\n%s To restart crashed polecats: gt session restart <rig>/<polecat>\n",
			style.Dim.Render("Tip:"))
	}
	return nil
}

View File

@@ -6,6 +6,7 @@ import (
"fmt"
"log"
"os"
"os/exec"
"os/signal"
"path/filepath"
"strconv"
@@ -202,6 +203,10 @@ func (d *Daemon) heartbeat(state *State) {
// 7. Check for orphaned work (assigned to dead agents)
d.checkOrphanedWork()
// 8. Check polecat session health (proactive crash detection)
// This validates tmux sessions are still alive for polecats with work-on-hook
d.checkPolecatSessionHealth()
// Update state
state.LastHeartbeat = time.Now()
state.HeartbeatCount++
@@ -469,3 +474,150 @@ func StopDaemon(townRoot string) error {
return nil
}
// checkPolecatSessionHealth runs the polecat session check for every rig
// returned by getKnownRigs, handing each rig name to checkRigPolecatHealth.
//
// The per-polecat check restarts a polecat when its tmux session is gone
// while its agent bead still has a hook bead set. This is intended to
// recover crashed polecats sooner than the GUPP timeout or Witness detection.
func (d *Daemon) checkPolecatSessionHealth() {
	for _, name := range d.getKnownRigs() {
		d.checkRigPolecatHealth(name)
	}
}
// checkRigPolecatHealth checks every polecat of one rig.
//
// Each subdirectory of <TownRoot>/<rigName>/polecats is treated as a polecat
// and passed to checkPolecatHealth; non-directory entries are ignored. If the
// polecats directory cannot be read, the rig is skipped silently.
func (d *Daemon) checkRigPolecatHealth(rigName string) {
	dir := filepath.Join(d.config.TownRoot, rigName, "polecats")

	children, readErr := os.ReadDir(dir)
	if readErr != nil {
		// Nothing readable here - the rig may simply have no polecats.
		return
	}

	for _, child := range children {
		if child.IsDir() {
			d.checkPolecatHealth(rigName, child.Name())
		}
	}
}
// checkPolecatHealth examines one polecat and restarts it when it looks
// crashed.
//
// A polecat counts as crashed when its tmux session gt-<rig>-<polecat> does
// not exist while its agent bead still carries a hook bead. In that case the
// session is recreated via restartPolecatSession; if that fails, the rig's
// witness is notified instead. A live session, a tmux lookup error, a missing
// agent bead, or an empty hook all end the check without further action.
func (d *Daemon) checkPolecatHealth(rigName, polecatName string) {
	sessionName := fmt.Sprintf("gt-%s-%s", rigName, polecatName)

	alive, err := d.tmux.HasSession(sessionName)
	switch {
	case err != nil:
		d.logger.Printf("Error checking session %s: %v", sessionName, err)
		return
	case alive:
		// Healthy session - leave it alone.
		return
	}

	// The session is gone; only act if the polecat still holds work.
	info, err := d.getAgentBeadInfo(beads.PolecatBeadID(rigName, polecatName))
	if err != nil || info.HookBead == "" {
		// Either the agent bead could not be read (polecat may not be
		// registered) or nothing is hooked (polecat was idle).
		return
	}

	d.logger.Printf("CRASH DETECTED: polecat %s/%s has hook_bead=%s but session %s is dead",
		rigName, polecatName, info.HookBead, sessionName)

	restartErr := d.restartPolecatSession(rigName, polecatName, sessionName)
	if restartErr == nil {
		d.logger.Printf("Successfully restarted crashed polecat %s/%s", rigName, polecatName)
		return
	}

	// Restart failed: log it and hand the problem to the witness.
	d.logger.Printf("Error restarting polecat %s/%s: %v", rigName, polecatName, restartErr)
	d.notifyWitnessOfCrashedPolecat(rigName, polecatName, info.HookBead, restartErr)
}
// restartPolecatSession recreates the tmux session of a crashed polecat and
// launches Claude inside it.
//
// Steps, in order: confirm the worktree <TownRoot>/<rig>/polecats/<polecat>
// exists, sync the workspace, create the tmux session in that directory, set
// the session environment, apply the rig theme, install the pane-died hook,
// and send the startup command.
//
// It returns an error when the worktree is missing, when the session cannot
// be created, or when the startup command cannot be sent. Failures while
// setting environment variables, theming, or installing the hook are ignored.
func (d *Daemon) restartPolecatSession(rigName, polecatName, sessionName string) error {
	worktree := filepath.Join(d.config.TownRoot, rigName, "polecats", polecatName)

	// Refuse to start a session for a worktree that is not there.
	_, statErr := os.Stat(worktree)
	if os.IsNotExist(statErr) {
		return fmt.Errorf("polecat worktree does not exist: %s", worktree)
	}

	// Pre-sync so the restarted polecat sees current beads.
	d.syncWorkspace(worktree)

	if err := d.tmux.NewSession(sessionName, worktree); err != nil {
		return fmt.Errorf("creating session: %w", err)
	}

	actor := fmt.Sprintf("%s/polecats/%s", rigName, polecatName)
	agentID := fmt.Sprintf("%s/%s", rigName, polecatName)

	// Session environment, applied in this order. Individual failures are
	// deliberately ignored (best effort).
	sessionEnv := []struct{ key, value string }{
		{"GT_ROLE", "polecat"},
		{"GT_RIG", rigName},
		{"GT_POLECAT", polecatName},
		{"BD_ACTOR", actor},
		{"BEADS_DIR", filepath.Join(d.config.TownRoot, rigName, ".beads")},
		{"BEADS_NO_DAEMON", "1"},
		{"BEADS_AGENT_NAME", agentID},
	}
	for _, kv := range sessionEnv {
		_ = d.tmux.SetEnvironment(sessionName, kv.key, kv.value)
	}

	// Theme and pane-died hook are best effort as well.
	_ = d.tmux.ConfigureGasTownSession(sessionName, tmux.AssignTheme(rigName), rigName, polecatName, "polecat")
	_ = d.tmux.SetPaneDiedHook(sessionName, agentID)

	// The launch command exports the key variables inline before starting
	// Claude.
	launch := fmt.Sprintf("export GT_ROLE=polecat GT_RIG=%s GT_POLECAT=%s BD_ACTOR=%s && claude --dangerously-skip-permissions",
		rigName, polecatName, actor)
	if err := d.tmux.SendKeys(sessionName, launch); err != nil {
		return fmt.Errorf("sending startup command: %w", err)
	}
	return nil
}
// notifyWitnessOfCrashedPolecat mails the rig's witness (<rig>/witness) that
// an automatic polecat restart failed.
//
// The message is sent by running "gt mail send" from the town root and
// carries the polecat name, its hook bead, and the restart error. If the
// command itself fails, a warning is logged and nothing else happens.
func (d *Daemon) notifyWitnessOfCrashedPolecat(rigName, polecatName, hookBead string, restartErr error) {
	mailArgs := []string{
		"mail", "send", rigName + "/witness",
		"-s", fmt.Sprintf("CRASHED_POLECAT: %s/%s restart failed", rigName, polecatName),
		"-m", fmt.Sprintf(`Polecat %s crashed and automatic restart failed.
hook_bead: %s
restart_error: %v
Manual intervention may be required.`,
			polecatName, hookBead, restartErr),
	}

	mail := exec.Command("gt", mailArgs...)
	mail.Dir = d.config.TownRoot

	runErr := mail.Run()
	if runErr == nil {
		return
	}
	d.logger.Printf("Warning: failed to notify witness of crashed polecat: %v", runErr)
}

View File

@@ -235,6 +235,21 @@ func (d *Daemon) identityToSession(identity string) string {
if strings.Contains(identity, "-crew-") {
return "gt-" + identity
}
// Pattern: <rig>-polecat-<name> or <rig>/polecats/<name> → gt-<rig>-<name>
if strings.Contains(identity, "-polecat-") {
// <rig>-polecat-<name> → gt-<rig>-<name>
parts := strings.SplitN(identity, "-polecat-", 2)
if len(parts) == 2 {
return fmt.Sprintf("gt-%s-%s", parts[0], parts[1])
}
}
if strings.Contains(identity, "/polecats/") {
// <rig>/polecats/<name> → gt-<rig>-<name>
parts := strings.Split(identity, "/polecats/")
if len(parts) == 2 {
return fmt.Sprintf("gt-%s-%s", parts[0], parts[1])
}
}
// Unknown identity
return ""
}
@@ -277,6 +292,31 @@ func (d *Daemon) restartSession(sessionName, identity string) error {
startCmd = "exec claude --dangerously-skip-permissions"
agentRole = "crew"
needsPreSync = true
} else if strings.Contains(identity, "-polecat-") || strings.Contains(identity, "/polecats/") {
// Extract rig and polecat name from either format:
// <rig>-polecat-<name> or <rig>/polecats/<name>
var polecatName string
if strings.Contains(identity, "-polecat-") {
parts := strings.SplitN(identity, "-polecat-", 2)
if len(parts) != 2 {
return fmt.Errorf("invalid polecat identity format: %s", identity)
}
rigName = parts[0]
polecatName = parts[1]
} else {
parts := strings.Split(identity, "/polecats/")
if len(parts) != 2 {
return fmt.Errorf("invalid polecat identity format: %s", identity)
}
rigName = parts[0]
polecatName = parts[1]
}
workDir = filepath.Join(d.config.TownRoot, rigName, "polecats", polecatName)
bdActor := fmt.Sprintf("%s/polecats/%s", rigName, polecatName)
startCmd = fmt.Sprintf("export GT_ROLE=polecat GT_RIG=%s GT_POLECAT=%s BD_ACTOR=%s && claude --dangerously-skip-permissions",
rigName, polecatName, bdActor)
agentRole = "polecat"
needsPreSync = true
} else {
return fmt.Errorf("don't know how to restart %s", identity)
}
@@ -464,6 +504,24 @@ func (d *Daemon) identityToStateFile(identity string) string {
return filepath.Join(d.config.TownRoot, rigName, "crew", crewName, "state.json")
}
}
// Pattern: <rig>-polecat-<name> → <townRoot>/<rig>/polecats/<name>/state.json
if strings.Contains(identity, "-polecat-") {
parts := strings.SplitN(identity, "-polecat-", 2)
if len(parts) == 2 {
rigName := parts[0]
polecatName := parts[1]
return filepath.Join(d.config.TownRoot, rigName, "polecats", polecatName, "state.json")
}
}
// Pattern: <rig>/polecats/<name> → <townRoot>/<rig>/polecats/<name>/state.json
if strings.Contains(identity, "/polecats/") {
parts := strings.Split(identity, "/polecats/")
if len(parts) == 2 {
rigName := parts[0]
polecatName := parts[1]
return filepath.Join(d.config.TownRoot, rigName, "polecats", polecatName, "state.json")
}
}
// Unknown identity - can't determine state file
return ""
}
@@ -550,6 +608,7 @@ func (d *Daemon) getAgentBeadInfo(agentBeadID string) (*AgentBeadInfo, error) {
// - "mayor" → "gt-mayor"
// - "gastown-witness" → "gt-gastown-witness"
// - "gastown-refinery" → "gt-gastown-refinery"
// - "gastown-polecat-toast" → "gt-polecat-gastown-toast"
func (d *Daemon) identityToAgentBeadID(identity string) string {
switch identity {
case "deacon":
@@ -574,6 +633,20 @@ func (d *Daemon) identityToAgentBeadID(identity string) string {
return beads.CrewBeadID(parts[0], parts[1])
}
}
// Pattern: <rig>-polecat-<name> → gt-polecat-<rig>-<name>
if strings.Contains(identity, "-polecat-") {
parts := strings.SplitN(identity, "-polecat-", 2)
if len(parts) == 2 {
return beads.PolecatBeadID(parts[0], parts[1])
}
}
// Pattern: <rig>/polecats/<name> → gt-polecat-<rig>-<name>
if strings.Contains(identity, "/polecats/") {
parts := strings.Split(identity, "/polecats/")
if len(parts) == 2 {
return beads.PolecatBeadID(parts[0], parts[1])
}
}
// Unknown format
return ""
}
@@ -673,6 +746,7 @@ func (d *Daemon) markAgentDead(agentBeadID string) error {
// - "gastown-witness" → "gastown/witness"
// - "gastown-refinery" → "gastown/refinery"
// - "gastown-crew-max" → "gastown/crew/max"
// - "gastown-polecat-toast" → "gastown/polecats/toast"
func identityToBDActor(identity string) string {
switch identity {
case "mayor", "deacon":
@@ -695,6 +769,17 @@ func identityToBDActor(identity string) string {
return parts[0] + "/crew/" + parts[1]
}
}
// Pattern: <rig>-polecat-<name> → <rig>/polecats/<name>
if strings.Contains(identity, "-polecat-") {
parts := strings.SplitN(identity, "-polecat-", 2)
if len(parts) == 2 {
return parts[0] + "/polecats/" + parts[1]
}
}
// Identity already in slash format - return as-is
if strings.Contains(identity, "/polecats/") {
return identity
}
// Unknown format - return as-is
return identity
}