feat(crash): improve crash logging and mass death detection
Add comprehensive crash logging improvements to help diagnose mass session death events:

- Add TypeSessionDeath and TypeMassDeath event types for feed visibility
- Log pre-death events before killing sessions (who killed, why)
- Add mass death detection in daemon (3+ deaths in 30s triggers alert)
- Add macOS crash report check in gt doctor
- Support session death events in townlog and feed curator

Closes hq-kt1o6

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -11,6 +11,7 @@ import (
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
@@ -20,6 +21,7 @@ import (
|
||||
"github.com/steveyegge/gastown/internal/config"
|
||||
"github.com/steveyegge/gastown/internal/constants"
|
||||
"github.com/steveyegge/gastown/internal/deacon"
|
||||
"github.com/steveyegge/gastown/internal/events"
|
||||
"github.com/steveyegge/gastown/internal/feed"
|
||||
"github.com/steveyegge/gastown/internal/polecat"
|
||||
"github.com/steveyegge/gastown/internal/refinery"
|
||||
@@ -41,8 +43,24 @@ type Daemon struct {
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
curator *feed.Curator
|
||||
|
||||
// Mass death detection: track recent session deaths
|
||||
deathsMu sync.Mutex
|
||||
recentDeaths []sessionDeath
|
||||
}
|
||||
|
||||
// sessionDeath is one entry in the daemon's list of recently detected session
// deaths. The list is examined to spot several sessions dying close together.
type sessionDeath struct {
	// sessionName is the name of the session that was found dead.
	sessionName string

	// timestamp is when the daemon recorded the death.
	timestamp time.Time
}
|
||||
|
||||
// Mass death detection tuning: an alert is raised once massDeathThreshold or
// more session deaths have been recorded within massDeathWindow.
const (
	// massDeathWindow is how far back a recorded death still counts toward
	// mass death detection.
	massDeathWindow = 30 * time.Second

	// massDeathThreshold is the number of in-window deaths that triggers an
	// alert.
	massDeathThreshold = 3
)
|
||||
|
||||
// New creates a new daemon instance.
|
||||
func New(config *Config) (*Daemon, error) {
|
||||
// Ensure daemon directory exists
|
||||
@@ -735,6 +753,9 @@ func (d *Daemon) checkPolecatHealth(rigName, polecatName string) {
|
||||
d.logger.Printf("CRASH DETECTED: polecat %s/%s has hook_bead=%s but session %s is dead",
|
||||
rigName, polecatName, info.HookBead, sessionName)
|
||||
|
||||
// Track this death for mass death detection
|
||||
d.recordSessionDeath(sessionName)
|
||||
|
||||
// Auto-restart the polecat
|
||||
if err := d.restartPolecatSession(rigName, polecatName, sessionName); err != nil {
|
||||
d.logger.Printf("Error restarting polecat %s/%s: %v", rigName, polecatName, err)
|
||||
@@ -745,6 +766,56 @@ func (d *Daemon) checkPolecatHealth(rigName, polecatName string) {
|
||||
}
|
||||
}
|
||||
|
||||
// recordSessionDeath records a session death and checks for mass death pattern.
|
||||
func (d *Daemon) recordSessionDeath(sessionName string) {
|
||||
d.deathsMu.Lock()
|
||||
defer d.deathsMu.Unlock()
|
||||
|
||||
now := time.Now()
|
||||
|
||||
// Add this death
|
||||
d.recentDeaths = append(d.recentDeaths, sessionDeath{
|
||||
sessionName: sessionName,
|
||||
timestamp: now,
|
||||
})
|
||||
|
||||
// Prune deaths outside the window
|
||||
cutoff := now.Add(-massDeathWindow)
|
||||
var recent []sessionDeath
|
||||
for _, death := range d.recentDeaths {
|
||||
if death.timestamp.After(cutoff) {
|
||||
recent = append(recent, death)
|
||||
}
|
||||
}
|
||||
d.recentDeaths = recent
|
||||
|
||||
// Check for mass death
|
||||
if len(d.recentDeaths) >= massDeathThreshold {
|
||||
d.emitMassDeathEvent()
|
||||
}
|
||||
}
|
||||
|
||||
// emitMassDeathEvent logs a mass death event when multiple sessions die in a short window.
|
||||
func (d *Daemon) emitMassDeathEvent() {
|
||||
// Collect session names
|
||||
var sessions []string
|
||||
for _, death := range d.recentDeaths {
|
||||
sessions = append(sessions, death.sessionName)
|
||||
}
|
||||
|
||||
count := len(sessions)
|
||||
window := massDeathWindow.String()
|
||||
|
||||
d.logger.Printf("MASS DEATH DETECTED: %d sessions died in %s: %v", count, window, sessions)
|
||||
|
||||
// Emit feed event
|
||||
_ = events.LogFeed(events.TypeMassDeath, "daemon",
|
||||
events.MassDeathPayload(count, window, sessions, ""))
|
||||
|
||||
// Clear the deaths to avoid repeated alerts
|
||||
d.recentDeaths = nil
|
||||
}
|
||||
|
||||
// restartPolecatSession restarts a crashed polecat session.
|
||||
func (d *Daemon) restartPolecatSession(rigName, polecatName, sessionName string) error {
|
||||
// Check rig operational state before auto-restarting
|
||||
|
||||
Reference in New Issue
Block a user