feat(crash): improve crash logging and mass death detection

Add comprehensive crash logging improvements to help diagnose mass session death events:

- Add TypeSessionDeath and TypeMassDeath event types for feed visibility
- Log pre-death events before killing sessions (who killed, why)
- Add mass death detection in daemon (3+ deaths in 30s triggers alert)
- Add macOS crash report check in gt doctor
- Support session death events in townlog and feed curator

Closes hq-kt1o6

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
nux
2026-01-09 13:19:38 -08:00
committed by beads/crew/giles
parent 97b70517cc
commit 692d6819f2
8 changed files with 346 additions and 3 deletions

View File

@@ -132,6 +132,7 @@ func runDoctor(cmd *cobra.Command, args []string) error {
d.Register(doctor.NewIdentityCollisionCheck())
d.Register(doctor.NewLinkedPaneCheck())
d.Register(doctor.NewThemeCheck())
d.Register(doctor.NewCrashReportCheck())
// Patrol system checks
d.Register(doctor.NewPatrolMoleculesExistCheck())

View File

@@ -11,6 +11,7 @@ import (
"path/filepath"
"strconv"
"strings"
"sync"
"syscall"
"time"
@@ -20,6 +21,7 @@ import (
"github.com/steveyegge/gastown/internal/config"
"github.com/steveyegge/gastown/internal/constants"
"github.com/steveyegge/gastown/internal/deacon"
"github.com/steveyegge/gastown/internal/events"
"github.com/steveyegge/gastown/internal/feed"
"github.com/steveyegge/gastown/internal/polecat"
"github.com/steveyegge/gastown/internal/refinery"
@@ -41,8 +43,24 @@ type Daemon struct {
ctx context.Context
cancel context.CancelFunc
curator *feed.Curator
// Mass death detection: track recent session deaths
deathsMu sync.Mutex
recentDeaths []sessionDeath
}
// sessionDeath records a detected session death for mass death analysis.
// Entries are appended by recordSessionDeath and kept in Daemon.recentDeaths
// until they age out of massDeathWindow or a mass death alert clears them.
type sessionDeath struct {
// sessionName is the session name passed to recordSessionDeath.
sessionName string
// timestamp is when the death was recorded (time.Now() at record time),
// not necessarily when the session actually died.
timestamp time.Time
}
// Mass death detection parameters.
// A mass death alert fires once at least massDeathThreshold deaths have been
// recorded within the trailing massDeathWindow.
const (
massDeathWindow = 30 * time.Second // Time window to detect mass death
massDeathThreshold = 3 // Number of deaths to trigger alert
)
// New creates a new daemon instance.
func New(config *Config) (*Daemon, error) {
// Ensure daemon directory exists
@@ -735,6 +753,9 @@ func (d *Daemon) checkPolecatHealth(rigName, polecatName string) {
d.logger.Printf("CRASH DETECTED: polecat %s/%s has hook_bead=%s but session %s is dead",
rigName, polecatName, info.HookBead, sessionName)
// Track this death for mass death detection
d.recordSessionDeath(sessionName)
// Auto-restart the polecat
if err := d.restartPolecatSession(rigName, polecatName, sessionName); err != nil {
d.logger.Printf("Error restarting polecat %s/%s: %v", rigName, polecatName, err)
@@ -745,6 +766,56 @@ func (d *Daemon) checkPolecatHealth(rigName, polecatName string) {
}
}
// recordSessionDeath notes that sessionName was found dead and raises a mass
// death alert when enough deaths have accumulated inside massDeathWindow.
//
// The whole operation runs under d.deathsMu, including the call to
// emitMassDeathEvent, which reads and clears d.recentDeaths.
func (d *Daemon) recordSessionDeath(sessionName string) {
	d.deathsMu.Lock()
	defer d.deathsMu.Unlock()

	now := time.Now()
	cutoff := now.Add(-massDeathWindow)

	// Keep only earlier deaths that are still inside the window.
	var kept []sessionDeath
	for i := range d.recentDeaths {
		if prior := d.recentDeaths[i]; prior.timestamp.After(cutoff) {
			kept = append(kept, prior)
		}
	}

	// The new death is stamped with now, which is always after the cutoff,
	// so it goes last without needing a window check of its own.
	d.recentDeaths = append(kept, sessionDeath{
		sessionName: sessionName,
		timestamp:   now,
	})

	if len(d.recentDeaths) < massDeathThreshold {
		return
	}
	d.emitMassDeathEvent()
}
// emitMassDeathEvent reports that multiple sessions died in a short window.
//
// It writes a line to the daemon log, emits a TypeMassDeath feed event listing
// every session currently in d.recentDeaths, and then clears d.recentDeaths so
// the same batch of deaths does not trigger repeated alerts.
//
// It accesses d.recentDeaths without taking d.deathsMu itself; the visible
// caller, recordSessionDeath, holds that mutex for the duration of the call.
func (d *Daemon) emitMassDeathEvent() {
	// Collect session names
	sessions := make([]string, 0, len(d.recentDeaths))
	for _, death := range d.recentDeaths {
		sessions = append(sessions, death.sessionName)
	}

	count := len(sessions)
	window := massDeathWindow.String()
	d.logger.Printf("MASS DEATH DETECTED: %d sessions died in %s: %v", count, window, sessions)

	// Emit feed event. This stays best-effort, but a failure is recorded in
	// the daemon log so a missing feed alert can be explained afterwards.
	if err := events.LogFeed(events.TypeMassDeath, "daemon",
		events.MassDeathPayload(count, window, sessions, "")); err != nil {
		d.logger.Printf("Warning: failed to log mass death feed event: %v", err)
	}

	// Clear the deaths to avoid repeated alerts
	d.recentDeaths = nil
}
// restartPolecatSession restarts a crashed polecat session.
func (d *Daemon) restartPolecatSession(rigName, polecatName, sessionName string) error {
// Check rig operational state before auto-restarting

View File

@@ -0,0 +1,185 @@
package doctor
import (
"fmt"
"os"
"path/filepath"
"runtime"
"sort"
"strings"
"time"
)
// CrashReportCheck looks for recent macOS crash reports whose file names
// mention tmux, Claude, or node.
// This helps diagnose mass session death events.
type CrashReportCheck struct {
BaseCheck
// crashReports holds the reports found by the most recent Run, sorted
// most recent first. It is overwritten on every Run that reaches the scan.
crashReports []crashReport // Cached during Run for display
}
// crashReport represents a found crash report file.
type crashReport struct {
// path is the report's directory joined with its file name.
path string
// name is the bare file name of the report.
name string
// modTime is the file's modification time, used for the recency filter
// and for ordering reports.
modTime time.Time
// process is the entry of Run's process list that matched the file name.
process string // "tmux", "claude", "node", etc.
}
// NewCrashReportCheck builds the "crash-reports" doctor check.
// The returned check starts with no cached crash reports.
func NewCrashReportCheck() *CrashReportCheck {
	base := BaseCheck{
		CheckName:        "crash-reports",
		CheckDescription: "Check for recent macOS crash reports (tmux, Claude)",
	}
	return &CrashReportCheck{BaseCheck: base}
}
// Run checks for recent crash reports in macOS diagnostic directories.
//
// On non-macOS hosts it returns StatusOK without scanning. On macOS it lists
// ~/Library/Logs/DiagnosticReports and /Library/Logs/DiagnosticReports and
// collects every regular entry whose lower-cased file name contains one of the
// watched process names and whose modification time falls within the last
// 24 hours. Directories that cannot be read and entries that cannot be
// stat'ed are skipped silently.
//
// The reports found are cached on the check (most recent first). The result is
// StatusOK when nothing was found, otherwise StatusWarning with one detail
// line per report and a per-process summary in the message. The ctx parameter
// is not used.
func (c *CrashReportCheck) Run(ctx *CheckContext) *CheckResult {
	// Only run on macOS
	if runtime.GOOS != "darwin" {
		return &CheckResult{
			Name:    c.Name(),
			Status:  StatusOK,
			Message: "Crash report check not applicable (non-macOS)",
		}
	}

	// Look for crash reports in the last 24 hours
	lookbackWindow := 24 * time.Hour
	cutoff := time.Now().Add(-lookbackWindow)

	// macOS crash report locations
	homeDir, err := os.UserHomeDir()
	if err != nil {
		return &CheckResult{
			Name:    c.Name(),
			Status:  StatusWarning,
			Message: "Could not determine home directory",
			Details: []string{err.Error()},
		}
	}

	crashDirs := []string{
		filepath.Join(homeDir, "Library", "Logs", "DiagnosticReports"),
		"/Library/Logs/DiagnosticReports",
	}

	// Processes we care about. Matching is by substring and the first hit
	// wins, so "claude-code" must come before its own prefix "claude" or it
	// could never be reported.
	relevantProcesses := []string{
		"tmux",
		"claude-code",
		"claude",
		"node",
	}

	var reports []crashReport
	for _, dir := range crashDirs {
		entries, err := os.ReadDir(dir)
		if err != nil {
			continue // Directory may not exist
		}

		for _, entry := range entries {
			if entry.IsDir() {
				continue
			}

			name := entry.Name()

			// Check if this is a crash report for a relevant process
			var matchedProcess string
			nameLower := strings.ToLower(name)
			for _, proc := range relevantProcesses {
				if strings.Contains(nameLower, proc) {
					matchedProcess = proc
					break
				}
			}
			if matchedProcess == "" {
				continue
			}

			// Check modification time
			info, err := entry.Info()
			if err != nil {
				continue
			}
			if info.ModTime().Before(cutoff) {
				continue // Too old
			}

			reports = append(reports, crashReport{
				path:    filepath.Join(dir, name),
				name:    name,
				modTime: info.ModTime(),
				process: matchedProcess,
			})
		}
	}

	// Sort by time (most recent first)
	sort.Slice(reports, func(i, j int) bool {
		return reports[i].modTime.After(reports[j].modTime)
	})

	// Cache for display
	c.crashReports = reports

	if len(reports) == 0 {
		return &CheckResult{
			Name:    c.Name(),
			Status:  StatusOK,
			Message: "No recent crash reports found",
		}
	}

	// Group by process
	processCounts := make(map[string]int)
	for _, r := range reports {
		processCounts[r.process]++
	}

	// Build details
	details := make([]string, 0, len(reports))
	for _, r := range reports {
		age := time.Since(r.modTime).Round(time.Minute)
		details = append(details, fmt.Sprintf("%s (%s ago): %s", r.process, age, r.name))
	}

	// Build summary. Map iteration order is random in Go, so walk the process
	// names in sorted order to keep the message stable between runs.
	procNames := make([]string, 0, len(processCounts))
	for proc := range processCounts {
		procNames = append(procNames, proc)
	}
	sort.Strings(procNames)

	summary := make([]string, 0, len(procNames))
	for _, proc := range procNames {
		summary = append(summary, fmt.Sprintf("%d %s", processCounts[proc], proc))
	}

	message := fmt.Sprintf("Found %d crash report(s): %s", len(reports), strings.Join(summary, ", "))

	// tmux crashes are particularly concerning: call them out in the message.
	// The status stays a warning either way.
	if processCounts["tmux"] > 0 {
		message += " - TMUX CRASHED (may explain session deaths)"
	}

	return &CheckResult{
		Name:    c.Name(),
		Status:  StatusWarning,
		Message: message,
		Details: details,
		FixHint: "Review crash reports in Console.app → User Reports or check ~/Library/Logs/DiagnosticReports/",
	}
}
// Fix does nothing - crash reports are informational.
// It always returns nil and does not use ctx.
func (c *CrashReportCheck) Fix(ctx *CheckContext) error {
return nil
}
// CanFix returns false - crash reports cannot be auto-fixed.
// Fix on this check is a no-op.
func (c *CrashReportCheck) CanFix() bool {
return false
}

View File

@@ -8,6 +8,7 @@ import (
"regexp"
"strings"
"github.com/steveyegge/gastown/internal/events"
"github.com/steveyegge/gastown/internal/session"
"github.com/steveyegge/gastown/internal/tmux"
)
@@ -115,13 +116,16 @@ func (c *OrphanSessionCheck) Fix(ctx *CheckContext) error {
t := tmux.NewTmux()
var lastErr error
for _, session := range c.orphanSessions {
for _, sess := range c.orphanSessions {
// SAFEGUARD: Never auto-kill crew sessions.
// Crew workers are human-managed and require explicit action.
if isCrewSession(session) {
if isCrewSession(sess) {
continue
}
if err := t.KillSession(session); err != nil {
// Log pre-death event for crash investigation (before killing)
_ = events.LogFeed(events.TypeSessionDeath, sess,
events.SessionDeathPayload(sess, "unknown", "orphan cleanup", "gt doctor"))
if err := t.KillSession(sess); err != nil {
lastErr = err
}
}

View File

@@ -50,6 +50,10 @@ const (
TypeSessionStart = "session_start"
TypeSessionEnd = "session_end"
// Session death events (for crash investigation)
TypeSessionDeath = "session_death" // Feed-visible session termination
TypeMassDeath = "mass_death" // Multiple sessions died in short window
// Witness patrol events
TypePatrolStarted = "patrol_started"
TypePolecatChecked = "polecat_checked"
@@ -274,6 +278,37 @@ func HaltPayload(services []string) map[string]interface{} {
}
}
// SessionDeathPayload builds the payload map attached to session death events.
//
// All four values are stored as given, including empty strings:
//   - session: tmux session name that died (key "session")
//   - agent: Gas Town agent identity, e.g. "gastown/polecats/Toast" (key "agent")
//   - reason: why the session was killed, e.g. "zombie cleanup", "user request",
//     "doctor fix" (key "reason")
//   - caller: what initiated the kill, e.g. "daemon", "doctor", "gt down"
//     (key "caller")
func SessionDeathPayload(session, agent, reason, caller string) map[string]interface{} {
	payload := make(map[string]interface{}, 4)
	payload["session"] = session
	payload["agent"] = agent
	payload["reason"] = reason
	payload["caller"] = caller
	return payload
}
// MassDeathPayload builds the payload map attached to mass death events.
//
//   - count: number of sessions that died (key "count")
//   - window: time window in which the deaths occurred, e.g. "30s" (key "window")
//   - sessions: names of the sessions that died; stored as passed, without
//     copying (key "sessions")
//   - possibleCause: suspected cause if known; the "possible_cause" key is
//     only present when this is non-empty
func MassDeathPayload(count int, window string, sessions []string, possibleCause string) map[string]interface{} {
	payload := make(map[string]interface{}, 4)
	payload["count"] = count
	payload["window"] = window
	payload["sessions"] = sessions

	// No known cause: leave the optional key out entirely.
	if possibleCause == "" {
		return payload
	}
	payload["possible_cause"] = possibleCause
	return payload
}
// SessionPayload creates a payload for session start/end events.
// sessionID: Claude Code session UUID
// role: Gas Town role (e.g., "gastown/crew/joe", "deacon")

View File

@@ -346,6 +346,28 @@ func (c *Curator) generateSummary(event *events.Event) string {
}
return "Merge failed"
case events.TypeSessionDeath:
session, _ := event.Payload["session"].(string)
reason, _ := event.Payload["reason"].(string)
if session != "" && reason != "" {
return fmt.Sprintf("Session %s terminated: %s", session, reason)
}
if session != "" {
return fmt.Sprintf("Session %s terminated", session)
}
return "Session terminated"
case events.TypeMassDeath:
count, _ := event.Payload["count"].(float64) // JSON numbers are float64
possibleCause, _ := event.Payload["possible_cause"].(string)
if count > 0 && possibleCause != "" {
return fmt.Sprintf("MASS DEATH: %d sessions died - %s", int(count), possibleCause)
}
if count > 0 {
return fmt.Sprintf("MASS DEATH: %d sessions died simultaneously", int(count))
}
return "Multiple sessions died simultaneously"
default:
return fmt.Sprintf("%s: %s", event.Actor, event.Type)
}

View File

@@ -6,6 +6,7 @@ import (
"time"
"github.com/steveyegge/gastown/internal/boot"
"github.com/steveyegge/gastown/internal/events"
"github.com/steveyegge/gastown/internal/tmux"
)
@@ -44,6 +45,14 @@ func StopTownSession(t *tmux.Tmux, ts TownSession, force bool) (bool, error) {
time.Sleep(100 * time.Millisecond)
}
// Log pre-death event for crash investigation (before killing)
reason := "user shutdown"
if force {
reason = "forced shutdown"
}
_ = events.LogFeed(events.TypeSessionDeath, ts.SessionID,
events.SessionDeathPayload(ts.SessionID, ts.Name, reason, "gt down"))
// Kill the session
if err := t.KillSession(ts.SessionID); err != nil {
return false, fmt.Errorf("killing %s session: %w", ts.Name, err)

View File

@@ -36,6 +36,10 @@ const (
EventPolecatNudged EventType = "polecat_nudged"
EventEscalationSent EventType = "escalation_sent"
EventPatrolComplete EventType = "patrol_complete"
// Session death events (for crash investigation)
EventSessionDeath EventType = "session_death" // Session terminated (with reason)
EventMassDeath EventType = "mass_death" // Multiple sessions died in short window
)
// Event represents a single agent lifecycle event.
@@ -188,6 +192,18 @@ func formatLogLine(e Event) string {
} else {
detail = "patrol complete"
}
case EventSessionDeath:
if e.Context != "" {
detail = fmt.Sprintf("session terminated (%s)", e.Context)
} else {
detail = "session terminated"
}
case EventMassDeath:
if e.Context != "" {
detail = fmt.Sprintf("MASS SESSION DEATH (%s)", e.Context)
} else {
detail = "MASS SESSION DEATH"
}
default:
detail = string(e.Type)
if e.Context != "" {