diff --git a/internal/cmd/doctor.go b/internal/cmd/doctor.go index 81e655c5..e1ffd444 100644 --- a/internal/cmd/doctor.go +++ b/internal/cmd/doctor.go @@ -132,6 +132,7 @@ func runDoctor(cmd *cobra.Command, args []string) error { d.Register(doctor.NewIdentityCollisionCheck()) d.Register(doctor.NewLinkedPaneCheck()) d.Register(doctor.NewThemeCheck()) + d.Register(doctor.NewCrashReportCheck()) // Patrol system checks d.Register(doctor.NewPatrolMoleculesExistCheck()) diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go index 2b12f40c..0a9186a2 100755 --- a/internal/daemon/daemon.go +++ b/internal/daemon/daemon.go @@ -11,6 +11,7 @@ import ( "path/filepath" "strconv" "strings" + "sync" "syscall" "time" @@ -20,6 +21,7 @@ import ( "github.com/steveyegge/gastown/internal/config" "github.com/steveyegge/gastown/internal/constants" "github.com/steveyegge/gastown/internal/deacon" + "github.com/steveyegge/gastown/internal/events" "github.com/steveyegge/gastown/internal/feed" "github.com/steveyegge/gastown/internal/polecat" "github.com/steveyegge/gastown/internal/refinery" @@ -41,8 +43,24 @@ type Daemon struct { ctx context.Context cancel context.CancelFunc curator *feed.Curator + + // Mass death detection: track recent session deaths + deathsMu sync.Mutex + recentDeaths []sessionDeath } +// sessionDeath records a detected session death for mass death analysis. +type sessionDeath struct { + sessionName string + timestamp time.Time +} + +// Mass death detection parameters +const ( + massDeathWindow = 30 * time.Second // Time window to detect mass death + massDeathThreshold = 3 // Number of deaths to trigger alert +) + // New creates a new daemon instance. func New(config *Config) (*Daemon, error) { // Ensure daemon directory exists @@ -735,6 +753,9 @@ func (d *Daemon) checkPolecatHealth(rigName, polecatName string) { d.logger.Printf("CRASH DETECTED: polecat %s/%s has hook_bead=%s but session %s is dead", rigName, polecatName, info.HookBead, sessionName) + // Track this death for mass death detection + d.recordSessionDeath(sessionName) + // Auto-restart the polecat if err := d.restartPolecatSession(rigName, polecatName, sessionName); err != nil { d.logger.Printf("Error restarting polecat %s/%s: %v", rigName, polecatName, err) @@ -745,6 +766,56 @@ func (d *Daemon) checkPolecatHealth(rigName, polecatName string) { } } +// recordSessionDeath records a session death and checks for mass death pattern. +func (d *Daemon) recordSessionDeath(sessionName string) { + d.deathsMu.Lock() + defer d.deathsMu.Unlock() + + now := time.Now() + + // Add this death + d.recentDeaths = append(d.recentDeaths, sessionDeath{ + sessionName: sessionName, + timestamp: now, + }) + + // Prune deaths outside the window + cutoff := now.Add(-massDeathWindow) + var recent []sessionDeath + for _, death := range d.recentDeaths { + if death.timestamp.After(cutoff) { + recent = append(recent, death) + } + } + d.recentDeaths = recent + + // Check for mass death + if len(d.recentDeaths) >= massDeathThreshold { + d.emitMassDeathEvent() + } +} + +// emitMassDeathEvent logs a mass death event when multiple sessions die in a short window. +func (d *Daemon) emitMassDeathEvent() { + // Collect session names + var sessions []string + for _, death := range d.recentDeaths { + sessions = append(sessions, death.sessionName) + } + + count := len(sessions) + window := massDeathWindow.String() + + d.logger.Printf("MASS DEATH DETECTED: %d sessions died in %s: %v", count, window, sessions) + + // Emit feed event + _ = events.LogFeed(events.TypeMassDeath, "daemon", + events.MassDeathPayload(count, window, sessions, "")) + + // Clear the deaths to avoid repeated alerts + d.recentDeaths = nil +} + // restartPolecatSession restarts a crashed polecat session. func (d *Daemon) restartPolecatSession(rigName, polecatName, sessionName string) error { // Check rig operational state before auto-restarting diff --git a/internal/doctor/crash_report_check.go b/internal/doctor/crash_report_check.go new file mode 100644 index 00000000..05e7cee6 --- /dev/null +++ b/internal/doctor/crash_report_check.go @@ -0,0 +1,185 @@ +package doctor + +import ( + "fmt" + "os" + "path/filepath" + "runtime" + "sort" + "strings" + "time" +) + +// CrashReportCheck looks for recent macOS crash reports related to tmux or Claude. +// This helps diagnose mass session death events. +type CrashReportCheck struct { + BaseCheck + crashReports []crashReport // Cached during Run for display +} + +// crashReport represents a found crash report file. +type crashReport struct { + path string + name string + modTime time.Time + process string // "tmux", "claude", "node", etc. +} + +// NewCrashReportCheck creates a new crash report check. +func NewCrashReportCheck() *CrashReportCheck { + return &CrashReportCheck{ + BaseCheck: BaseCheck{ + CheckName: "crash-reports", + CheckDescription: "Check for recent macOS crash reports (tmux, Claude)", + }, + } +} + +// Run checks for recent crash reports in macOS diagnostic directories. +func (c *CrashReportCheck) Run(ctx *CheckContext) *CheckResult { + // Only run on macOS + if runtime.GOOS != "darwin" { + return &CheckResult{ + Name: c.Name(), + Status: StatusOK, + Message: "Crash report check not applicable (non-macOS)", + } + } + + // Look for crash reports in the last 24 hours + lookbackWindow := 24 * time.Hour + cutoff := time.Now().Add(-lookbackWindow) + + // macOS crash report locations + homeDir, err := os.UserHomeDir() + if err != nil { + return &CheckResult{ + Name: c.Name(), + Status: StatusWarning, + Message: "Could not determine home directory", + Details: []string{err.Error()}, + } + } + + crashDirs := []string{ + filepath.Join(homeDir, "Library", "Logs", "DiagnosticReports"), + "/Library/Logs/DiagnosticReports", + } + + // Processes we care about + relevantProcesses := []string{ + "tmux", + "claude", + "claude-code", + "node", + } + + var reports []crashReport + + for _, dir := range crashDirs { + entries, err := os.ReadDir(dir) + if err != nil { + continue // Directory may not exist + } + + for _, entry := range entries { + if entry.IsDir() { + continue + } + + name := entry.Name() + + // Check if this is a crash report for a relevant process + var matchedProcess string + nameLower := strings.ToLower(name) + for _, proc := range relevantProcesses { + if strings.Contains(nameLower, proc) { + matchedProcess = proc + break + } + } + + if matchedProcess == "" { + continue + } + + // Check modification time + info, err := entry.Info() + if err != nil { + continue + } + + if info.ModTime().Before(cutoff) { + continue // Too old + } + + reports = append(reports, crashReport{ + path: filepath.Join(dir, name), + name: name, + modTime: info.ModTime(), + process: matchedProcess, + }) + } + } + + // Sort by time (most recent first) + sort.Slice(reports, func(i, j int) bool { + return reports[i].modTime.After(reports[j].modTime) + }) + + // Cache for display + c.crashReports = reports + + if len(reports) == 0 { + return &CheckResult{ + Name: c.Name(), + Status: StatusOK, + Message: "No recent crash reports found", + } + } + + // Group by process + processCounts := make(map[string]int) + for _, r := range reports { + processCounts[r.process]++ + } + + // Build details + var details []string + for _, r := range reports { + age := time.Since(r.modTime).Round(time.Minute) + details = append(details, fmt.Sprintf("%s (%s ago): %s", r.process, age, r.name)) + } + + // Build summary + var summary []string + for proc, count := range processCounts { + summary = append(summary, fmt.Sprintf("%d %s", count, proc)) + } + + message := fmt.Sprintf("Found %d crash report(s): %s", len(reports), strings.Join(summary, ", ")) + + // tmux crashes are particularly concerning + status := StatusWarning + if processCounts["tmux"] > 0 { + message += " - TMUX CRASHED (may explain session deaths)" + } + + return &CheckResult{ + Name: c.Name(), + Status: status, + Message: message, + Details: details, + FixHint: "Review crash reports in Console.app → User Reports or check ~/Library/Logs/DiagnosticReports/", + } +} + +// Fix does nothing - crash reports are informational. +func (c *CrashReportCheck) Fix(ctx *CheckContext) error { + return nil +} + +// CanFix returns false - crash reports cannot be auto-fixed. +func (c *CrashReportCheck) CanFix() bool { + return false +} diff --git a/internal/doctor/orphan_check.go b/internal/doctor/orphan_check.go index be143711..03a2e426 100644 --- a/internal/doctor/orphan_check.go +++ b/internal/doctor/orphan_check.go @@ -8,6 +8,7 @@ import ( "regexp" "strings" + "github.com/steveyegge/gastown/internal/events" "github.com/steveyegge/gastown/internal/session" "github.com/steveyegge/gastown/internal/tmux" ) @@ -115,13 +116,16 @@ func (c *OrphanSessionCheck) Fix(ctx *CheckContext) error { t := tmux.NewTmux() var lastErr error - for _, session := range c.orphanSessions { + for _, sess := range c.orphanSessions { // SAFEGUARD: Never auto-kill crew sessions. // Crew workers are human-managed and require explicit action. - if isCrewSession(session) { + if isCrewSession(sess) { continue } - if err := t.KillSession(session); err != nil { + // Log pre-death event for crash investigation (before killing) + _ = events.LogFeed(events.TypeSessionDeath, sess, + events.SessionDeathPayload(sess, "unknown", "orphan cleanup", "gt doctor")) + if err := t.KillSession(sess); err != nil { lastErr = err } } diff --git a/internal/events/events.go b/internal/events/events.go index 161c90d1..cd3e2357 100644 --- a/internal/events/events.go +++ b/internal/events/events.go @@ -50,6 +50,10 @@ const ( TypeSessionStart = "session_start" TypeSessionEnd = "session_end" + // Session death events (for crash investigation) + TypeSessionDeath = "session_death" // Feed-visible session termination + TypeMassDeath = "mass_death" // Multiple sessions died in short window + // Witness patrol events TypePatrolStarted = "patrol_started" TypePolecatChecked = "polecat_checked" @@ -274,6 +278,37 @@ func HaltPayload(services []string) map[string]interface{} { } } +// SessionDeathPayload creates a payload for session death events. +// session: tmux session name that died +// agent: Gas Town agent identity (e.g., "gastown/polecats/Toast") +// reason: why the session was killed (e.g., "zombie cleanup", "user request", "doctor fix") +// caller: what initiated the kill (e.g., "daemon", "doctor", "gt down") +func SessionDeathPayload(session, agent, reason, caller string) map[string]interface{} { + return map[string]interface{}{ + "session": session, + "agent": agent, + "reason": reason, + "caller": caller, + } +} + +// MassDeathPayload creates a payload for mass death events. +// count: number of sessions that died +// window: time window in which deaths occurred (e.g., "5s") +// sessions: list of session names that died +// possibleCause: suspected cause if known +func MassDeathPayload(count int, window string, sessions []string, possibleCause string) map[string]interface{} { + p := map[string]interface{}{ + "count": count, + "window": window, + "sessions": sessions, + } + if possibleCause != "" { + p["possible_cause"] = possibleCause + } + return p +} + // SessionPayload creates a payload for session start/end events. // sessionID: Claude Code session UUID // role: Gas Town role (e.g., "gastown/crew/joe", "deacon") diff --git a/internal/feed/curator.go b/internal/feed/curator.go index c1fa28e8..bd0f7b04 100644 --- a/internal/feed/curator.go +++ b/internal/feed/curator.go @@ -346,6 +346,28 @@ func (c *Curator) generateSummary(event *events.Event) string { } return "Merge failed" + case events.TypeSessionDeath: + session, _ := event.Payload["session"].(string) + reason, _ := event.Payload["reason"].(string) + if session != "" && reason != "" { + return fmt.Sprintf("Session %s terminated: %s", session, reason) + } + if session != "" { + return fmt.Sprintf("Session %s terminated", session) + } + return "Session terminated" + + case events.TypeMassDeath: + count, _ := event.Payload["count"].(float64) // JSON numbers are float64 + possibleCause, _ := event.Payload["possible_cause"].(string) + if count > 0 && possibleCause != "" { + return fmt.Sprintf("MASS DEATH: %d sessions died - %s", int(count), possibleCause) + } + if count > 0 { + return fmt.Sprintf("MASS DEATH: %d sessions died simultaneously", int(count)) + } + return "Multiple sessions died simultaneously" + default: return fmt.Sprintf("%s: %s", event.Actor, event.Type) } diff --git a/internal/session/town.go b/internal/session/town.go index 20c4ec52..e52380aa 100644 --- a/internal/session/town.go +++ b/internal/session/town.go @@ -6,6 +6,7 @@ import ( "time" "github.com/steveyegge/gastown/internal/boot" + "github.com/steveyegge/gastown/internal/events" "github.com/steveyegge/gastown/internal/tmux" ) @@ -44,6 +45,14 @@ func StopTownSession(t *tmux.Tmux, ts TownSession, force bool) (bool, error) { time.Sleep(100 * time.Millisecond) } + // Log pre-death event for crash investigation (before killing) + reason := "user shutdown" + if force { + reason = "forced shutdown" + } + _ = events.LogFeed(events.TypeSessionDeath, ts.SessionID, + events.SessionDeathPayload(ts.SessionID, ts.Name, reason, "gt down")) + // Kill the session if err := t.KillSession(ts.SessionID); err != nil { return false, fmt.Errorf("killing %s session: %w", ts.Name, err) diff --git a/internal/townlog/logger.go b/internal/townlog/logger.go index 71079434..883edddb 100644 --- a/internal/townlog/logger.go +++ b/internal/townlog/logger.go @@ -36,6 +36,10 @@ const ( EventPolecatNudged EventType = "polecat_nudged" EventEscalationSent EventType = "escalation_sent" EventPatrolComplete EventType = "patrol_complete" + + // Session death events (for crash investigation) + EventSessionDeath EventType = "session_death" // Session terminated (with reason) + EventMassDeath EventType = "mass_death" // Multiple sessions died in short window ) // Event represents a single agent lifecycle event. @@ -188,6 +192,18 @@ func formatLogLine(e Event) string { } else { detail = "patrol complete" } + case EventSessionDeath: + if e.Context != "" { + detail = fmt.Sprintf("session terminated (%s)", e.Context) + } else { + detail = "session terminated" + } + case EventMassDeath: + if e.Context != "" { + detail = fmt.Sprintf("MASS SESSION DEATH (%s)", e.Context) + } else { + detail = "MASS SESSION DEATH" + } default: detail = string(e.Type) if e.Context != "" {