fix: Add session health monitoring and auto-restart for crashed polecats (gt-i7wcn)
This fix addresses the issue where polecat sessions terminate unexpectedly during work without recovery.

Changes:

- Add `checkPolecatSessionHealth()` to daemon heartbeat loop
  - Proactively validates tmux sessions are alive for polecats
  - Detects crashed polecats that have work-on-hook
  - Auto-restarts crashed polecats with proper environment setup
  - Notifies Witness if restart fails as fallback
- Add polecat support to lifecycle identity mapping
  - `identityToSession()` now handles polecat identities
  - `restartSession()` can restart crashed polecat sessions
  - `identityToStateFile()` handles polecat state files
  - `identityToAgentBeadID()` handles polecat agent beads
  - `identityToBDActor()` handles polecat BD_ACTOR conversion
- Add `gt session check` command for manual health checking
  - Validates tmux sessions exist for all polecats
  - Shows summary of healthy vs not-running sessions
  - Useful for debugging session issues

This provides faster recovery (within heartbeat interval) compared to waiting for GUPP violation timeout (30 min) or Witness detection.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -6,6 +6,7 @@ import (
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"os/exec"
|
||||
"os/signal"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
@@ -202,6 +203,10 @@ func (d *Daemon) heartbeat(state *State) {
|
||||
// 7. Check for orphaned work (assigned to dead agents)
|
||||
d.checkOrphanedWork()
|
||||
|
||||
// 8. Check polecat session health (proactive crash detection)
|
||||
// This validates tmux sessions are still alive for polecats with work-on-hook
|
||||
d.checkPolecatSessionHealth()
|
||||
|
||||
// Update state
|
||||
state.LastHeartbeat = time.Now()
|
||||
state.HeartbeatCount++
|
||||
@@ -469,3 +474,150 @@ func StopDaemon(townRoot string) error {
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// checkPolecatSessionHealth proactively validates polecat tmux sessions.
|
||||
// This detects crashed polecats that:
|
||||
// 1. Have work-on-hook (assigned work)
|
||||
// 2. Report state=running/working in their agent bead
|
||||
// 3. But the tmux session is actually dead
|
||||
//
|
||||
// When a crash is detected, the polecat is automatically restarted.
|
||||
// This provides faster recovery than waiting for GUPP timeout or Witness detection.
|
||||
func (d *Daemon) checkPolecatSessionHealth() {
|
||||
rigs := d.getKnownRigs()
|
||||
for _, rigName := range rigs {
|
||||
d.checkRigPolecatHealth(rigName)
|
||||
}
|
||||
}
|
||||
|
||||
// checkRigPolecatHealth checks polecat session health for a specific rig.
|
||||
func (d *Daemon) checkRigPolecatHealth(rigName string) {
|
||||
// Get polecat directories for this rig
|
||||
polecatsDir := filepath.Join(d.config.TownRoot, rigName, "polecats")
|
||||
entries, err := os.ReadDir(polecatsDir)
|
||||
if err != nil {
|
||||
return // No polecats directory - rig might not have polecats
|
||||
}
|
||||
|
||||
for _, entry := range entries {
|
||||
if !entry.IsDir() {
|
||||
continue
|
||||
}
|
||||
polecatName := entry.Name()
|
||||
d.checkPolecatHealth(rigName, polecatName)
|
||||
}
|
||||
}
|
||||
|
||||
// checkPolecatHealth checks a single polecat's session health.
|
||||
// If the polecat has work-on-hook but the tmux session is dead, it's restarted.
|
||||
func (d *Daemon) checkPolecatHealth(rigName, polecatName string) {
|
||||
// Build the expected tmux session name
|
||||
sessionName := fmt.Sprintf("gt-%s-%s", rigName, polecatName)
|
||||
|
||||
// Check if tmux session exists
|
||||
sessionAlive, err := d.tmux.HasSession(sessionName)
|
||||
if err != nil {
|
||||
d.logger.Printf("Error checking session %s: %v", sessionName, err)
|
||||
return
|
||||
}
|
||||
|
||||
if sessionAlive {
|
||||
// Session is alive - nothing to do
|
||||
return
|
||||
}
|
||||
|
||||
// Session is dead. Check if the polecat has work-on-hook.
|
||||
agentBeadID := beads.PolecatBeadID(rigName, polecatName)
|
||||
info, err := d.getAgentBeadInfo(agentBeadID)
|
||||
if err != nil {
|
||||
// Agent bead doesn't exist or error - polecat might not be registered
|
||||
return
|
||||
}
|
||||
|
||||
// Check if polecat has hooked work
|
||||
if info.HookBead == "" {
|
||||
// No hooked work - no need to restart (polecat was idle)
|
||||
return
|
||||
}
|
||||
|
||||
// Polecat has work but session is dead - this is a crash!
|
||||
d.logger.Printf("CRASH DETECTED: polecat %s/%s has hook_bead=%s but session %s is dead",
|
||||
rigName, polecatName, info.HookBead, sessionName)
|
||||
|
||||
// Auto-restart the polecat
|
||||
if err := d.restartPolecatSession(rigName, polecatName, sessionName); err != nil {
|
||||
d.logger.Printf("Error restarting polecat %s/%s: %v", rigName, polecatName, err)
|
||||
// Notify witness as fallback
|
||||
d.notifyWitnessOfCrashedPolecat(rigName, polecatName, info.HookBead, err)
|
||||
} else {
|
||||
d.logger.Printf("Successfully restarted crashed polecat %s/%s", rigName, polecatName)
|
||||
}
|
||||
}
|
||||
|
||||
// restartPolecatSession restarts a crashed polecat session.
|
||||
func (d *Daemon) restartPolecatSession(rigName, polecatName, sessionName string) error {
|
||||
// Determine working directory
|
||||
workDir := filepath.Join(d.config.TownRoot, rigName, "polecats", polecatName)
|
||||
|
||||
// Verify the worktree exists
|
||||
if _, err := os.Stat(workDir); os.IsNotExist(err) {
|
||||
return fmt.Errorf("polecat worktree does not exist: %s", workDir)
|
||||
}
|
||||
|
||||
// Pre-sync workspace (ensure beads are current)
|
||||
d.syncWorkspace(workDir)
|
||||
|
||||
// Create new tmux session
|
||||
if err := d.tmux.NewSession(sessionName, workDir); err != nil {
|
||||
return fmt.Errorf("creating session: %w", err)
|
||||
}
|
||||
|
||||
// Set environment variables
|
||||
_ = d.tmux.SetEnvironment(sessionName, "GT_ROLE", "polecat")
|
||||
_ = d.tmux.SetEnvironment(sessionName, "GT_RIG", rigName)
|
||||
_ = d.tmux.SetEnvironment(sessionName, "GT_POLECAT", polecatName)
|
||||
|
||||
bdActor := fmt.Sprintf("%s/polecats/%s", rigName, polecatName)
|
||||
_ = d.tmux.SetEnvironment(sessionName, "BD_ACTOR", bdActor)
|
||||
|
||||
beadsDir := filepath.Join(d.config.TownRoot, rigName, ".beads")
|
||||
_ = d.tmux.SetEnvironment(sessionName, "BEADS_DIR", beadsDir)
|
||||
_ = d.tmux.SetEnvironment(sessionName, "BEADS_NO_DAEMON", "1")
|
||||
_ = d.tmux.SetEnvironment(sessionName, "BEADS_AGENT_NAME", fmt.Sprintf("%s/%s", rigName, polecatName))
|
||||
|
||||
// Apply theme
|
||||
theme := tmux.AssignTheme(rigName)
|
||||
_ = d.tmux.ConfigureGasTownSession(sessionName, theme, rigName, polecatName, "polecat")
|
||||
|
||||
// Set pane-died hook for future crash detection
|
||||
agentID := fmt.Sprintf("%s/%s", rigName, polecatName)
|
||||
_ = d.tmux.SetPaneDiedHook(sessionName, agentID)
|
||||
|
||||
// Launch Claude with environment exported inline
|
||||
startCmd := fmt.Sprintf("export GT_ROLE=polecat GT_RIG=%s GT_POLECAT=%s BD_ACTOR=%s && claude --dangerously-skip-permissions",
|
||||
rigName, polecatName, bdActor)
|
||||
if err := d.tmux.SendKeys(sessionName, startCmd); err != nil {
|
||||
return fmt.Errorf("sending startup command: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// notifyWitnessOfCrashedPolecat notifies the witness when a polecat restart fails.
|
||||
func (d *Daemon) notifyWitnessOfCrashedPolecat(rigName, polecatName, hookBead string, restartErr error) {
|
||||
witnessAddr := rigName + "/witness"
|
||||
subject := fmt.Sprintf("CRASHED_POLECAT: %s/%s restart failed", rigName, polecatName)
|
||||
body := fmt.Sprintf(`Polecat %s crashed and automatic restart failed.
|
||||
|
||||
hook_bead: %s
|
||||
restart_error: %v
|
||||
|
||||
Manual intervention may be required.`,
|
||||
polecatName, hookBead, restartErr)
|
||||
|
||||
cmd := exec.Command("gt", "mail", "send", witnessAddr, "-s", subject, "-m", body)
|
||||
cmd.Dir = d.config.TownRoot
|
||||
if err := cmd.Run(); err != nil {
|
||||
d.logger.Printf("Warning: failed to notify witness of crashed polecat: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -235,6 +235,21 @@ func (d *Daemon) identityToSession(identity string) string {
|
||||
if strings.Contains(identity, "-crew-") {
|
||||
return "gt-" + identity
|
||||
}
|
||||
// Pattern: <rig>-polecat-<name> or <rig>/polecats/<name> → gt-<rig>-<name>
|
||||
if strings.Contains(identity, "-polecat-") {
|
||||
// <rig>-polecat-<name> → gt-<rig>-<name>
|
||||
parts := strings.SplitN(identity, "-polecat-", 2)
|
||||
if len(parts) == 2 {
|
||||
return fmt.Sprintf("gt-%s-%s", parts[0], parts[1])
|
||||
}
|
||||
}
|
||||
if strings.Contains(identity, "/polecats/") {
|
||||
// <rig>/polecats/<name> → gt-<rig>-<name>
|
||||
parts := strings.Split(identity, "/polecats/")
|
||||
if len(parts) == 2 {
|
||||
return fmt.Sprintf("gt-%s-%s", parts[0], parts[1])
|
||||
}
|
||||
}
|
||||
// Unknown identity
|
||||
return ""
|
||||
}
|
||||
@@ -277,6 +292,31 @@ func (d *Daemon) restartSession(sessionName, identity string) error {
|
||||
startCmd = "exec claude --dangerously-skip-permissions"
|
||||
agentRole = "crew"
|
||||
needsPreSync = true
|
||||
} else if strings.Contains(identity, "-polecat-") || strings.Contains(identity, "/polecats/") {
|
||||
// Extract rig and polecat name from either format:
|
||||
// <rig>-polecat-<name> or <rig>/polecats/<name>
|
||||
var polecatName string
|
||||
if strings.Contains(identity, "-polecat-") {
|
||||
parts := strings.SplitN(identity, "-polecat-", 2)
|
||||
if len(parts) != 2 {
|
||||
return fmt.Errorf("invalid polecat identity format: %s", identity)
|
||||
}
|
||||
rigName = parts[0]
|
||||
polecatName = parts[1]
|
||||
} else {
|
||||
parts := strings.Split(identity, "/polecats/")
|
||||
if len(parts) != 2 {
|
||||
return fmt.Errorf("invalid polecat identity format: %s", identity)
|
||||
}
|
||||
rigName = parts[0]
|
||||
polecatName = parts[1]
|
||||
}
|
||||
workDir = filepath.Join(d.config.TownRoot, rigName, "polecats", polecatName)
|
||||
bdActor := fmt.Sprintf("%s/polecats/%s", rigName, polecatName)
|
||||
startCmd = fmt.Sprintf("export GT_ROLE=polecat GT_RIG=%s GT_POLECAT=%s BD_ACTOR=%s && claude --dangerously-skip-permissions",
|
||||
rigName, polecatName, bdActor)
|
||||
agentRole = "polecat"
|
||||
needsPreSync = true
|
||||
} else {
|
||||
return fmt.Errorf("don't know how to restart %s", identity)
|
||||
}
|
||||
@@ -464,6 +504,24 @@ func (d *Daemon) identityToStateFile(identity string) string {
|
||||
return filepath.Join(d.config.TownRoot, rigName, "crew", crewName, "state.json")
|
||||
}
|
||||
}
|
||||
// Pattern: <rig>-polecat-<name> → <townRoot>/<rig>/polecats/<name>/state.json
|
||||
if strings.Contains(identity, "-polecat-") {
|
||||
parts := strings.SplitN(identity, "-polecat-", 2)
|
||||
if len(parts) == 2 {
|
||||
rigName := parts[0]
|
||||
polecatName := parts[1]
|
||||
return filepath.Join(d.config.TownRoot, rigName, "polecats", polecatName, "state.json")
|
||||
}
|
||||
}
|
||||
// Pattern: <rig>/polecats/<name> → <townRoot>/<rig>/polecats/<name>/state.json
|
||||
if strings.Contains(identity, "/polecats/") {
|
||||
parts := strings.Split(identity, "/polecats/")
|
||||
if len(parts) == 2 {
|
||||
rigName := parts[0]
|
||||
polecatName := parts[1]
|
||||
return filepath.Join(d.config.TownRoot, rigName, "polecats", polecatName, "state.json")
|
||||
}
|
||||
}
|
||||
// Unknown identity - can't determine state file
|
||||
return ""
|
||||
}
|
||||
@@ -550,6 +608,7 @@ func (d *Daemon) getAgentBeadInfo(agentBeadID string) (*AgentBeadInfo, error) {
|
||||
// - "mayor" → "gt-mayor"
|
||||
// - "gastown-witness" → "gt-gastown-witness"
|
||||
// - "gastown-refinery" → "gt-gastown-refinery"
|
||||
// - "gastown-polecat-toast" → "gt-polecat-gastown-toast"
|
||||
func (d *Daemon) identityToAgentBeadID(identity string) string {
|
||||
switch identity {
|
||||
case "deacon":
|
||||
@@ -574,6 +633,20 @@ func (d *Daemon) identityToAgentBeadID(identity string) string {
|
||||
return beads.CrewBeadID(parts[0], parts[1])
|
||||
}
|
||||
}
|
||||
// Pattern: <rig>-polecat-<name> → gt-polecat-<rig>-<name>
|
||||
if strings.Contains(identity, "-polecat-") {
|
||||
parts := strings.SplitN(identity, "-polecat-", 2)
|
||||
if len(parts) == 2 {
|
||||
return beads.PolecatBeadID(parts[0], parts[1])
|
||||
}
|
||||
}
|
||||
// Pattern: <rig>/polecats/<name> → gt-polecat-<rig>-<name>
|
||||
if strings.Contains(identity, "/polecats/") {
|
||||
parts := strings.Split(identity, "/polecats/")
|
||||
if len(parts) == 2 {
|
||||
return beads.PolecatBeadID(parts[0], parts[1])
|
||||
}
|
||||
}
|
||||
// Unknown format
|
||||
return ""
|
||||
}
|
||||
@@ -673,6 +746,7 @@ func (d *Daemon) markAgentDead(agentBeadID string) error {
|
||||
// - "gastown-witness" → "gastown/witness"
|
||||
// - "gastown-refinery" → "gastown/refinery"
|
||||
// - "gastown-crew-max" → "gastown/crew/max"
|
||||
// - "gastown-polecat-toast" → "gastown/polecats/toast"
|
||||
func identityToBDActor(identity string) string {
|
||||
switch identity {
|
||||
case "mayor", "deacon":
|
||||
@@ -695,6 +769,17 @@ func identityToBDActor(identity string) string {
|
||||
return parts[0] + "/crew/" + parts[1]
|
||||
}
|
||||
}
|
||||
// Pattern: <rig>-polecat-<name> → <rig>/polecats/<name>
|
||||
if strings.Contains(identity, "-polecat-") {
|
||||
parts := strings.SplitN(identity, "-polecat-", 2)
|
||||
if len(parts) == 2 {
|
||||
return parts[0] + "/polecats/" + parts[1]
|
||||
}
|
||||
}
|
||||
// Identity already in slash format - return as-is
|
||||
if strings.Contains(identity, "/polecats/") {
|
||||
return identity
|
||||
}
|
||||
// Unknown format - return as-is
|
||||
return identity
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user