feat(crash): improve crash logging and mass death detection
Add comprehensive crash logging improvements to help diagnose mass session death events:

- Add TypeSessionDeath and TypeMassDeath event types for feed visibility
- Log pre-death events before killing sessions (who killed, why)
- Add mass death detection in daemon (3+ deaths in 30s triggers alert)
- Add macOS crash report check in gt doctor
- Support session death events in townlog and feed curator

Closes hq-kt1o6

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -132,6 +132,7 @@ func runDoctor(cmd *cobra.Command, args []string) error {
|
||||
d.Register(doctor.NewIdentityCollisionCheck())
|
||||
d.Register(doctor.NewLinkedPaneCheck())
|
||||
d.Register(doctor.NewThemeCheck())
|
||||
d.Register(doctor.NewCrashReportCheck())
|
||||
|
||||
// Patrol system checks
|
||||
d.Register(doctor.NewPatrolMoleculesExistCheck())
|
||||
|
||||
@@ -11,6 +11,7 @@ import (
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
@@ -20,6 +21,7 @@ import (
|
||||
"github.com/steveyegge/gastown/internal/config"
|
||||
"github.com/steveyegge/gastown/internal/constants"
|
||||
"github.com/steveyegge/gastown/internal/deacon"
|
||||
"github.com/steveyegge/gastown/internal/events"
|
||||
"github.com/steveyegge/gastown/internal/feed"
|
||||
"github.com/steveyegge/gastown/internal/polecat"
|
||||
"github.com/steveyegge/gastown/internal/refinery"
|
||||
@@ -41,8 +43,24 @@ type Daemon struct {
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
curator *feed.Curator
|
||||
|
||||
// Mass death detection: track recent session deaths
|
||||
deathsMu sync.Mutex
|
||||
recentDeaths []sessionDeath
|
||||
}
|
||||
|
||||
// sessionDeath is one entry in the daemon's list of recently detected
// session deaths, which is used to spot mass death events.
type sessionDeath struct {
	// sessionName is the name of the session that was found dead.
	sessionName string
	// timestamp is when the death was recorded.
	timestamp time.Time
}
|
||||
|
||||
// Tuning knobs for mass death detection.
const (
	// massDeathWindow is how far back recorded deaths are kept when looking
	// for a mass death.
	massDeathWindow = 30 * time.Second

	// massDeathThreshold is the number of deaths inside massDeathWindow that
	// triggers an alert.
	massDeathThreshold = 3
)
|
||||
|
||||
// New creates a new daemon instance.
|
||||
func New(config *Config) (*Daemon, error) {
|
||||
// Ensure daemon directory exists
|
||||
@@ -735,6 +753,9 @@ func (d *Daemon) checkPolecatHealth(rigName, polecatName string) {
|
||||
d.logger.Printf("CRASH DETECTED: polecat %s/%s has hook_bead=%s but session %s is dead",
|
||||
rigName, polecatName, info.HookBead, sessionName)
|
||||
|
||||
// Track this death for mass death detection
|
||||
d.recordSessionDeath(sessionName)
|
||||
|
||||
// Auto-restart the polecat
|
||||
if err := d.restartPolecatSession(rigName, polecatName, sessionName); err != nil {
|
||||
d.logger.Printf("Error restarting polecat %s/%s: %v", rigName, polecatName, err)
|
||||
@@ -745,6 +766,56 @@ func (d *Daemon) checkPolecatHealth(rigName, polecatName string) {
|
||||
}
|
||||
}
|
||||
|
||||
// recordSessionDeath records a session death and checks for mass death pattern.
|
||||
func (d *Daemon) recordSessionDeath(sessionName string) {
|
||||
d.deathsMu.Lock()
|
||||
defer d.deathsMu.Unlock()
|
||||
|
||||
now := time.Now()
|
||||
|
||||
// Add this death
|
||||
d.recentDeaths = append(d.recentDeaths, sessionDeath{
|
||||
sessionName: sessionName,
|
||||
timestamp: now,
|
||||
})
|
||||
|
||||
// Prune deaths outside the window
|
||||
cutoff := now.Add(-massDeathWindow)
|
||||
var recent []sessionDeath
|
||||
for _, death := range d.recentDeaths {
|
||||
if death.timestamp.After(cutoff) {
|
||||
recent = append(recent, death)
|
||||
}
|
||||
}
|
||||
d.recentDeaths = recent
|
||||
|
||||
// Check for mass death
|
||||
if len(d.recentDeaths) >= massDeathThreshold {
|
||||
d.emitMassDeathEvent()
|
||||
}
|
||||
}
|
||||
|
||||
// emitMassDeathEvent logs a mass death event when multiple sessions die in a short window.
|
||||
func (d *Daemon) emitMassDeathEvent() {
|
||||
// Collect session names
|
||||
var sessions []string
|
||||
for _, death := range d.recentDeaths {
|
||||
sessions = append(sessions, death.sessionName)
|
||||
}
|
||||
|
||||
count := len(sessions)
|
||||
window := massDeathWindow.String()
|
||||
|
||||
d.logger.Printf("MASS DEATH DETECTED: %d sessions died in %s: %v", count, window, sessions)
|
||||
|
||||
// Emit feed event
|
||||
_ = events.LogFeed(events.TypeMassDeath, "daemon",
|
||||
events.MassDeathPayload(count, window, sessions, ""))
|
||||
|
||||
// Clear the deaths to avoid repeated alerts
|
||||
d.recentDeaths = nil
|
||||
}
|
||||
|
||||
// restartPolecatSession restarts a crashed polecat session.
|
||||
func (d *Daemon) restartPolecatSession(rigName, polecatName, sessionName string) error {
|
||||
// Check rig operational state before auto-restarting
|
||||
|
||||
185
internal/doctor/crash_report_check.go
Normal file
185
internal/doctor/crash_report_check.go
Normal file
@@ -0,0 +1,185 @@
|
||||
package doctor
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// CrashReportCheck looks for recent macOS crash reports related to tmux or Claude.
|
||||
// This helps diagnose mass session death events.
|
||||
type CrashReportCheck struct {
|
||||
BaseCheck
|
||||
crashReports []crashReport // Cached during Run for display
|
||||
}
|
||||
|
||||
// crashReport describes one crash report file found on disk.
type crashReport struct {
	// path is the directory joined with the file name.
	path string
	// name is the file name of the report.
	name string
	// modTime is the file's modification time.
	modTime time.Time
	// process is the matched process label: "tmux", "claude", "node", etc.
	process string
}
|
||||
|
||||
// NewCrashReportCheck creates a new crash report check.
|
||||
func NewCrashReportCheck() *CrashReportCheck {
|
||||
return &CrashReportCheck{
|
||||
BaseCheck: BaseCheck{
|
||||
CheckName: "crash-reports",
|
||||
CheckDescription: "Check for recent macOS crash reports (tmux, Claude)",
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Run checks for recent crash reports in macOS diagnostic directories.
|
||||
func (c *CrashReportCheck) Run(ctx *CheckContext) *CheckResult {
|
||||
// Only run on macOS
|
||||
if runtime.GOOS != "darwin" {
|
||||
return &CheckResult{
|
||||
Name: c.Name(),
|
||||
Status: StatusOK,
|
||||
Message: "Crash report check not applicable (non-macOS)",
|
||||
}
|
||||
}
|
||||
|
||||
// Look for crash reports in the last 24 hours
|
||||
lookbackWindow := 24 * time.Hour
|
||||
cutoff := time.Now().Add(-lookbackWindow)
|
||||
|
||||
// macOS crash report locations
|
||||
homeDir, err := os.UserHomeDir()
|
||||
if err != nil {
|
||||
return &CheckResult{
|
||||
Name: c.Name(),
|
||||
Status: StatusWarning,
|
||||
Message: "Could not determine home directory",
|
||||
Details: []string{err.Error()},
|
||||
}
|
||||
}
|
||||
|
||||
crashDirs := []string{
|
||||
filepath.Join(homeDir, "Library", "Logs", "DiagnosticReports"),
|
||||
"/Library/Logs/DiagnosticReports",
|
||||
}
|
||||
|
||||
// Processes we care about
|
||||
relevantProcesses := []string{
|
||||
"tmux",
|
||||
"claude",
|
||||
"claude-code",
|
||||
"node",
|
||||
}
|
||||
|
||||
var reports []crashReport
|
||||
|
||||
for _, dir := range crashDirs {
|
||||
entries, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
continue // Directory may not exist
|
||||
}
|
||||
|
||||
for _, entry := range entries {
|
||||
if entry.IsDir() {
|
||||
continue
|
||||
}
|
||||
|
||||
name := entry.Name()
|
||||
|
||||
// Check if this is a crash report for a relevant process
|
||||
var matchedProcess string
|
||||
nameLower := strings.ToLower(name)
|
||||
for _, proc := range relevantProcesses {
|
||||
if strings.Contains(nameLower, proc) {
|
||||
matchedProcess = proc
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if matchedProcess == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check modification time
|
||||
info, err := entry.Info()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if info.ModTime().Before(cutoff) {
|
||||
continue // Too old
|
||||
}
|
||||
|
||||
reports = append(reports, crashReport{
|
||||
path: filepath.Join(dir, name),
|
||||
name: name,
|
||||
modTime: info.ModTime(),
|
||||
process: matchedProcess,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by time (most recent first)
|
||||
sort.Slice(reports, func(i, j int) bool {
|
||||
return reports[i].modTime.After(reports[j].modTime)
|
||||
})
|
||||
|
||||
// Cache for display
|
||||
c.crashReports = reports
|
||||
|
||||
if len(reports) == 0 {
|
||||
return &CheckResult{
|
||||
Name: c.Name(),
|
||||
Status: StatusOK,
|
||||
Message: "No recent crash reports found",
|
||||
}
|
||||
}
|
||||
|
||||
// Group by process
|
||||
processCounts := make(map[string]int)
|
||||
for _, r := range reports {
|
||||
processCounts[r.process]++
|
||||
}
|
||||
|
||||
// Build details
|
||||
var details []string
|
||||
for _, r := range reports {
|
||||
age := time.Since(r.modTime).Round(time.Minute)
|
||||
details = append(details, fmt.Sprintf("%s (%s ago): %s", r.process, age, r.name))
|
||||
}
|
||||
|
||||
// Build summary
|
||||
var summary []string
|
||||
for proc, count := range processCounts {
|
||||
summary = append(summary, fmt.Sprintf("%d %s", count, proc))
|
||||
}
|
||||
|
||||
message := fmt.Sprintf("Found %d crash report(s): %s", len(reports), strings.Join(summary, ", "))
|
||||
|
||||
// tmux crashes are particularly concerning
|
||||
status := StatusWarning
|
||||
if processCounts["tmux"] > 0 {
|
||||
message += " - TMUX CRASHED (may explain session deaths)"
|
||||
}
|
||||
|
||||
return &CheckResult{
|
||||
Name: c.Name(),
|
||||
Status: status,
|
||||
Message: message,
|
||||
Details: details,
|
||||
FixHint: "Review crash reports in Console.app → User Reports or check ~/Library/Logs/DiagnosticReports/",
|
||||
}
|
||||
}
|
||||
|
||||
// Fix does nothing - crash reports are informational.
|
||||
func (c *CrashReportCheck) Fix(ctx *CheckContext) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// CanFix returns false - crash reports cannot be auto-fixed.
|
||||
func (c *CrashReportCheck) CanFix() bool {
|
||||
return false
|
||||
}
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/steveyegge/gastown/internal/events"
|
||||
"github.com/steveyegge/gastown/internal/session"
|
||||
"github.com/steveyegge/gastown/internal/tmux"
|
||||
)
|
||||
@@ -115,13 +116,16 @@ func (c *OrphanSessionCheck) Fix(ctx *CheckContext) error {
|
||||
t := tmux.NewTmux()
|
||||
var lastErr error
|
||||
|
||||
for _, session := range c.orphanSessions {
|
||||
for _, sess := range c.orphanSessions {
|
||||
// SAFEGUARD: Never auto-kill crew sessions.
|
||||
// Crew workers are human-managed and require explicit action.
|
||||
if isCrewSession(session) {
|
||||
if isCrewSession(sess) {
|
||||
continue
|
||||
}
|
||||
if err := t.KillSession(session); err != nil {
|
||||
// Log pre-death event for crash investigation (before killing)
|
||||
_ = events.LogFeed(events.TypeSessionDeath, sess,
|
||||
events.SessionDeathPayload(sess, "unknown", "orphan cleanup", "gt doctor"))
|
||||
if err := t.KillSession(sess); err != nil {
|
||||
lastErr = err
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,6 +50,10 @@ const (
|
||||
TypeSessionStart = "session_start"
|
||||
TypeSessionEnd = "session_end"
|
||||
|
||||
// Session death events (for crash investigation)
|
||||
TypeSessionDeath = "session_death" // Feed-visible session termination
|
||||
TypeMassDeath = "mass_death" // Multiple sessions died in short window
|
||||
|
||||
// Witness patrol events
|
||||
TypePatrolStarted = "patrol_started"
|
||||
TypePolecatChecked = "polecat_checked"
|
||||
@@ -274,6 +278,37 @@ func HaltPayload(services []string) map[string]interface{} {
|
||||
}
|
||||
}
|
||||
|
||||
// SessionDeathPayload builds the payload map for a session death event.
//
// Parameters, each stored under the key of the same name:
//   - session: tmux session name that died
//   - agent: Gas Town agent identity (e.g., "gastown/polecats/Toast")
//   - reason: why the session was killed (e.g., "zombie cleanup", "user request", "doctor fix")
//   - caller: what initiated the kill (e.g., "daemon", "doctor", "gt down")
func SessionDeathPayload(session, agent, reason, caller string) map[string]interface{} {
	payload := make(map[string]interface{}, 4)
	payload["session"] = session
	payload["agent"] = agent
	payload["reason"] = reason
	payload["caller"] = caller
	return payload
}
|
||||
|
||||
// MassDeathPayload builds the payload map for a mass death event.
//
// Parameters:
//   - count: number of sessions that died (key "count")
//   - window: time window in which deaths occurred, e.g. "5s" (key "window")
//   - sessions: list of session names that died (key "sessions")
//   - possibleCause: suspected cause if known; stored under "possible_cause"
//     only when non-empty
func MassDeathPayload(count int, window string, sessions []string, possibleCause string) map[string]interface{} {
	if possibleCause == "" {
		return map[string]interface{}{
			"count":    count,
			"window":   window,
			"sessions": sessions,
		}
	}
	return map[string]interface{}{
		"count":          count,
		"window":         window,
		"sessions":       sessions,
		"possible_cause": possibleCause,
	}
}
|
||||
|
||||
// SessionPayload creates a payload for session start/end events.
|
||||
// sessionID: Claude Code session UUID
|
||||
// role: Gas Town role (e.g., "gastown/crew/joe", "deacon")
|
||||
|
||||
@@ -346,6 +346,28 @@ func (c *Curator) generateSummary(event *events.Event) string {
|
||||
}
|
||||
return "Merge failed"
|
||||
|
||||
case events.TypeSessionDeath:
|
||||
session, _ := event.Payload["session"].(string)
|
||||
reason, _ := event.Payload["reason"].(string)
|
||||
if session != "" && reason != "" {
|
||||
return fmt.Sprintf("Session %s terminated: %s", session, reason)
|
||||
}
|
||||
if session != "" {
|
||||
return fmt.Sprintf("Session %s terminated", session)
|
||||
}
|
||||
return "Session terminated"
|
||||
|
||||
case events.TypeMassDeath:
|
||||
count, _ := event.Payload["count"].(float64) // JSON numbers are float64
|
||||
possibleCause, _ := event.Payload["possible_cause"].(string)
|
||||
if count > 0 && possibleCause != "" {
|
||||
return fmt.Sprintf("MASS DEATH: %d sessions died - %s", int(count), possibleCause)
|
||||
}
|
||||
if count > 0 {
|
||||
return fmt.Sprintf("MASS DEATH: %d sessions died simultaneously", int(count))
|
||||
}
|
||||
return "Multiple sessions died simultaneously"
|
||||
|
||||
default:
|
||||
return fmt.Sprintf("%s: %s", event.Actor, event.Type)
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/steveyegge/gastown/internal/boot"
|
||||
"github.com/steveyegge/gastown/internal/events"
|
||||
"github.com/steveyegge/gastown/internal/tmux"
|
||||
)
|
||||
|
||||
@@ -44,6 +45,14 @@ func StopTownSession(t *tmux.Tmux, ts TownSession, force bool) (bool, error) {
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
}
|
||||
|
||||
// Log pre-death event for crash investigation (before killing)
|
||||
reason := "user shutdown"
|
||||
if force {
|
||||
reason = "forced shutdown"
|
||||
}
|
||||
_ = events.LogFeed(events.TypeSessionDeath, ts.SessionID,
|
||||
events.SessionDeathPayload(ts.SessionID, ts.Name, reason, "gt down"))
|
||||
|
||||
// Kill the session
|
||||
if err := t.KillSession(ts.SessionID); err != nil {
|
||||
return false, fmt.Errorf("killing %s session: %w", ts.Name, err)
|
||||
|
||||
@@ -36,6 +36,10 @@ const (
|
||||
EventPolecatNudged EventType = "polecat_nudged"
|
||||
EventEscalationSent EventType = "escalation_sent"
|
||||
EventPatrolComplete EventType = "patrol_complete"
|
||||
|
||||
// Session death events (for crash investigation)
|
||||
EventSessionDeath EventType = "session_death" // Session terminated (with reason)
|
||||
EventMassDeath EventType = "mass_death" // Multiple sessions died in short window
|
||||
)
|
||||
|
||||
// Event represents a single agent lifecycle event.
|
||||
@@ -188,6 +192,18 @@ func formatLogLine(e Event) string {
|
||||
} else {
|
||||
detail = "patrol complete"
|
||||
}
|
||||
case EventSessionDeath:
|
||||
if e.Context != "" {
|
||||
detail = fmt.Sprintf("session terminated (%s)", e.Context)
|
||||
} else {
|
||||
detail = "session terminated"
|
||||
}
|
||||
case EventMassDeath:
|
||||
if e.Context != "" {
|
||||
detail = fmt.Sprintf("MASS SESSION DEATH (%s)", e.Context)
|
||||
} else {
|
||||
detail = "MASS SESSION DEATH"
|
||||
}
|
||||
default:
|
||||
detail = string(e.Type)
|
||||
if e.Context != "" {
|
||||
|
||||
Reference in New Issue
Block a user