feat(crash): improve crash logging and mass death detection

Add comprehensive crash logging improvements to help diagnose mass session death events:

- Add TypeSessionDeath and TypeMassDeath event types for feed visibility
- Log pre-death events before killing sessions (who killed, why)
- Add mass death detection in daemon (3+ deaths within 30s trigger an alert)
- Add macOS crash report check in gt doctor
- Support session death events in townlog and feed curator

Closes hq-kt1o6

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
nux
2026-01-09 13:19:38 -08:00
committed by beads/crew/giles
parent 97b70517cc
commit 692d6819f2
8 changed files with 346 additions and 3 deletions

View File

@@ -11,6 +11,7 @@ import (
"path/filepath"
"strconv"
"strings"
"sync"
"syscall"
"time"
@@ -20,6 +21,7 @@ import (
"github.com/steveyegge/gastown/internal/config"
"github.com/steveyegge/gastown/internal/constants"
"github.com/steveyegge/gastown/internal/deacon"
"github.com/steveyegge/gastown/internal/events"
"github.com/steveyegge/gastown/internal/feed"
"github.com/steveyegge/gastown/internal/polecat"
"github.com/steveyegge/gastown/internal/refinery"
@@ -41,8 +43,24 @@ type Daemon struct {
ctx context.Context
cancel context.CancelFunc
curator *feed.Curator
// Mass death detection: track recent session deaths
deathsMu sync.Mutex
recentDeaths []sessionDeath
}
// sessionDeath records a detected session death for mass death analysis.
// Entries are appended to Daemon.recentDeaths by recordSessionDeath.
type sessionDeath struct {
// sessionName is the name of the session that was found dead.
sessionName string
// timestamp is when the death was recorded (time.Now() at detection time,
// not necessarily the moment the session actually died).
timestamp time.Time
}
// Mass death detection parameters.
//
// recordSessionDeath discards recorded deaths that are not newer than
// massDeathWindow and raises a mass death alert once at least
// massDeathThreshold deaths remain.
const (
massDeathWindow = 30 * time.Second // Time window to detect mass death
massDeathThreshold = 3 // Number of deaths to trigger alert
)
// New creates a new daemon instance.
func New(config *Config) (*Daemon, error) {
// Ensure daemon directory exists
@@ -735,6 +753,9 @@ func (d *Daemon) checkPolecatHealth(rigName, polecatName string) {
d.logger.Printf("CRASH DETECTED: polecat %s/%s has hook_bead=%s but session %s is dead",
rigName, polecatName, info.HookBead, sessionName)
// Track this death for mass death detection
d.recordSessionDeath(sessionName)
// Auto-restart the polecat
if err := d.restartPolecatSession(rigName, polecatName, sessionName); err != nil {
d.logger.Printf("Error restarting polecat %s/%s: %v", rigName, polecatName, err)
@@ -745,6 +766,56 @@ func (d *Daemon) checkPolecatHealth(rigName, polecatName string) {
}
}
// recordSessionDeath notes that sessionName was found dead and raises a mass
// death alert when enough deaths have accumulated inside massDeathWindow.
//
// The whole update, including the call to emitMassDeathEvent, runs while
// d.deathsMu is held.
func (d *Daemon) recordSessionDeath(sessionName string) {
	d.deathsMu.Lock()
	defer d.deathsMu.Unlock()

	detectedAt := time.Now()
	oldest := detectedAt.Add(-massDeathWindow)

	// Register the new death before pruning.
	d.recentDeaths = append(d.recentDeaths, sessionDeath{sessionName: sessionName, timestamp: detectedAt})

	// Rebuild the list, keeping only entries strictly newer than the cutoff.
	var kept []sessionDeath
	for i := range d.recentDeaths {
		entry := d.recentDeaths[i]
		if !entry.timestamp.After(oldest) {
			continue // at or before the cutoff: too old to count
		}
		kept = append(kept, entry)
	}
	d.recentDeaths = kept

	// Below the threshold there is nothing to report.
	if len(d.recentDeaths) < massDeathThreshold {
		return
	}
	d.emitMassDeathEvent()
}
// emitMassDeathEvent logs a mass death event when multiple sessions die in a
// short window. It writes the names of all sessions currently in
// d.recentDeaths to the daemon log, publishes a TypeMassDeath feed event, and
// then clears d.recentDeaths.
//
// It does not lock d.deathsMu itself: the caller (recordSessionDeath) must
// already hold it, since this method reads and resets d.recentDeaths.
func (d *Daemon) emitMassDeathEvent() {
	// Collect session names
	sessions := make([]string, 0, len(d.recentDeaths))
	for _, death := range d.recentDeaths {
		sessions = append(sessions, death.sessionName)
	}

	count := len(sessions)
	window := massDeathWindow.String()

	d.logger.Printf("MASS DEATH DETECTED: %d sessions died in %s: %v", count, window, sessions)

	// Emit feed event. This stays best-effort (a feed failure must not disrupt
	// the daemon), but the error is logged so a missing alert can be diagnosed.
	if err := events.LogFeed(events.TypeMassDeath, "daemon",
		events.MassDeathPayload(count, window, sessions, "")); err != nil {
		d.logger.Printf("Error emitting mass death feed event: %v", err)
	}

	// Clear the deaths to avoid repeated alerts
	d.recentDeaths = nil
}
// restartPolecatSession restarts a crashed polecat session.
func (d *Daemon) restartPolecatSession(rigName, polecatName, sessionName string) error {
// Check rig operational state before auto-restarting