From 9ae23a2bca36ac5e1e2cebff9172db897ac4947b Mon Sep 17 00:00:00 2001 From: Steve Yegge Date: Tue, 23 Dec 2025 00:48:10 -0800 Subject: [PATCH] Add crew-state and lifecycle-hygiene doctor checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New checks: - crew-state: Validates crew worker state.json files for completeness Can regenerate missing/invalid state files with --fix - lifecycle-hygiene: Detects stale lifecycle state that can wedge the deacon - Stale lifecycle messages in deacon inbox - Stuck requesting_* flags in state.json when session is healthy Can clean up with --fix (external intervention when deacon is stuck) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- internal/cmd/doctor.go | 6 + internal/doctor/crew_check.go | 215 ++++++++++++++++++++ internal/doctor/lifecycle_check.go | 302 +++++++++++++++++++++++++++++ 3 files changed, 523 insertions(+) create mode 100644 internal/doctor/crew_check.go create mode 100644 internal/doctor/lifecycle_check.go diff --git a/internal/cmd/doctor.go b/internal/cmd/doctor.go index 4b33f5c8..30cc7542 100644 --- a/internal/cmd/doctor.go +++ b/internal/cmd/doctor.go @@ -76,6 +76,12 @@ func runDoctor(cmd *cobra.Command, args []string) error { d.Register(doctor.NewRuntimeGitignoreCheck()) d.Register(doctor.NewLegacyGastownCheck()) + // Crew workspace checks + d.Register(doctor.NewCrewStateCheck()) + + // Lifecycle hygiene checks + d.Register(doctor.NewLifecycleHygieneCheck()) + // Run checks var report *doctor.Report if doctorFix { diff --git a/internal/doctor/crew_check.go b/internal/doctor/crew_check.go new file mode 100644 index 00000000..11fd18e2 --- /dev/null +++ b/internal/doctor/crew_check.go @@ -0,0 +1,215 @@ +package doctor + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + "time" +) + +// CrewStateCheck validates crew worker state.json files for completeness. 
+// A state.json that is empty or only partly filled in is what produces the
+// "can't find pane/session" errors.
+type CrewStateCheck struct {
+	FixableCheck
+	invalidCrews []invalidCrew // filled by Run, consumed by Fix
+}
+
+// invalidCrew records one crew workspace whose state.json failed validation.
+type invalidCrew struct {
+	path      string // crew workspace directory
+	stateFile string // path of the state.json inside that directory
+	rigName   string
+	crewName  string
+	issue     string // human-readable description of the problem
+}
+
+// NewCrewStateCheck creates a new crew state check.
+func NewCrewStateCheck() *CrewStateCheck {
+	base := BaseCheck{
+		CheckName:        "crew-state",
+		CheckDescription: "Validate crew worker state.json files",
+	}
+	return &CrewStateCheck{FixableCheck: FixableCheck{BaseCheck: base}}
+}
+
+// Run checks all crew state.json files for completeness.
+//
+// A workspace without a state.json counts as valid. A file that cannot be
+// read, does not parse, or has an empty name, rig or clone_path is recorded in
+// c.invalidCrews (for Fix) and listed in the result details. The result is
+// StatusOK when nothing was recorded and StatusWarning otherwise.
+func (c *CrewStateCheck) Run(ctx *CheckContext) *CheckResult {
+	c.invalidCrews = nil
+
+	workspaces := c.findAllCrewDirs(ctx.TownRoot)
+	if len(workspaces) == 0 {
+		return &CheckResult{
+			Name:    c.Name(),
+			Status:  StatusOK,
+			Message: "No crew workspaces found",
+		}
+	}
+
+	var (
+		okCount int
+		details []string
+	)
+
+	// flag remembers a failing workspace for Fix and adds it to the report.
+	flag := func(ws crewDir, stateFile, issue string) {
+		c.invalidCrews = append(c.invalidCrews, invalidCrew{
+			path:      ws.path,
+			stateFile: stateFile,
+			rigName:   ws.rigName,
+			crewName:  ws.crewName,
+			issue:     issue,
+		})
+		details = append(details, fmt.Sprintf("%s/%s: %s", ws.rigName, ws.crewName, issue))
+	}
+
+	for _, ws := range workspaces {
+		stateFile := filepath.Join(ws.path, "state.json")
+
+		raw, err := os.ReadFile(stateFile)
+		switch {
+		case err == nil:
+			// Fall through to validation below.
+		case os.IsNotExist(err):
+			// Missing state file is OK - code will use defaults.
+			okCount++
+			continue
+		default:
+			flag(ws, stateFile, fmt.Sprintf("cannot read state.json: %v", err))
+			continue
+		}
+
+		var state struct {
+			Name      string `json:"name"`
+			Rig       string `json:"rig"`
+			ClonePath string `json:"clone_path"`
+		}
+		if json.Unmarshal(raw, &state) != nil {
+			flag(ws, stateFile, "invalid JSON in state.json")
+			continue
+		}
+
+		// Collect every required field that came back empty.
+		var missing []string
+		for _, field := range []struct{ value, label string }{
+			{state.Name, "missing name"},
+			{state.Rig, "missing rig"},
+			{state.ClonePath, "missing clone_path"},
+		} {
+			if field.value == "" {
+				missing = append(missing, field.label)
+			}
+		}
+
+		if len(missing) == 0 {
+			okCount++
+			continue
+		}
+		flag(ws, stateFile, strings.Join(missing, ", "))
+	}
+
+	if len(c.invalidCrews) > 0 {
+		return &CheckResult{
+			Name:    c.Name(),
+			Status:  StatusWarning,
+			Message: fmt.Sprintf("%d crew workspace(s) with invalid state.json", len(c.invalidCrews)),
+			Details: details,
+			FixHint: "Run 'gt doctor --fix' to regenerate state files",
+		}
+	}
+
+	return &CheckResult{
+		Name:    c.Name(),
+		Status:  StatusOK,
+		Message: fmt.Sprintf("All %d crew state files valid", okCount),
+	}
+}
+
+// Fix regenerates invalid state.json files with correct values.
+//
+// It works from the list cached by the preceding Run. When the file on disk
+// still parses as a JSON object its contents are kept and only the entries
+// that are absent, empty or not strings are filled in, so a file that is
+// merely missing one field does not lose its branch, created_at or any other
+// recorded data. A file that cannot be read or parsed is rebuilt from scratch
+// with branch "main". updated_at is always refreshed.
+//
+// Every workspace is attempted. All failures appear in the returned error's
+// message; the last one remains reachable through errors.Is / errors.As.
+func (c *CrewStateCheck) Fix(ctx *CheckContext) error {
+	if len(c.invalidCrews) == 0 {
+		return nil
+	}
+
+	var errs []error
+	for _, ic := range c.invalidCrews {
+		if err := c.regenerateState(ic); err != nil {
+			errs = append(errs, fmt.Errorf("%s/%s: %w", ic.rigName, ic.crewName, err))
+		}
+	}
+
+	switch len(errs) {
+	case 0:
+		return nil
+	case 1:
+		return errs[0]
+	}
+
+	// Surface the earlier failures as text and keep the last one wrapped.
+	earlier := make([]string, 0, len(errs)-1)
+	for _, e := range errs[:len(errs)-1] {
+		earlier = append(earlier, e.Error())
+	}
+	return fmt.Errorf("%s; %w", strings.Join(earlier, "; "), errs[len(errs)-1])
+}
+
+// regenerateState rewrites ic.stateFile, keeping whatever valid data it held.
+func (c *CrewStateCheck) regenerateState(ic invalidCrew) error {
+	// Values stay as raw JSON so untouched fields are not altered by a
+	// decode/encode round trip.
+	state := map[string]json.RawMessage{}
+	if data, err := os.ReadFile(ic.stateFile); err == nil {
+		var existing map[string]json.RawMessage
+		if json.Unmarshal(data, &existing) == nil && existing != nil {
+			state = existing
+		}
+	}
+
+	put := func(key, val string) {
+		encoded, _ := json.Marshal(val) // marshalling a string cannot fail
+		state[key] = encoded
+	}
+	// ensure fills key only when it is absent, not a JSON string, or "".
+	ensure := func(key, val string) {
+		var current string
+		if raw, ok := state[key]; ok && json.Unmarshal(raw, &current) == nil && current != "" {
+			return
+		}
+		put(key, val)
+	}
+
+	// One timestamp so created_at and updated_at agree on a fresh file.
+	now := time.Now().Format(time.RFC3339)
+
+	ensure("name", ic.crewName)
+	ensure("rig", ic.rigName)
+	ensure("clone_path", ic.path)
+	ensure("branch", "main")
+	ensure("created_at", now)
+	put("updated_at", now)
+
+	data, err := json.MarshalIndent(state, "", " ")
+	if err != nil {
+		return err
+	}
+	return os.WriteFile(ic.stateFile, data, 0644)
+}
+
+// crewDir identifies one crew workspace directory inside a rig.
+type crewDir struct {
+	path     string // <townRoot>/<rig>/crew/<name>
+	rigName  string
+	crewName string
+}
+
+// findAllCrewDirs finds all crew directories in the workspace.
+//
+// Every non-hidden directory directly under townRoot except "mayor" is treated
+// as a rig, and every non-hidden directory under its "crew" subdirectory is
+// returned. Rigs whose crew directory cannot be read are skipped. The result
+// is nil when townRoot itself cannot be read.
+func (c *CrewStateCheck) findAllCrewDirs(townRoot string) []crewDir {
+	rigs, err := os.ReadDir(townRoot)
+	if err != nil {
+		return nil
+	}
+
+	var found []crewDir
+	for _, rig := range rigs {
+		rigName := rig.Name()
+		if !rig.IsDir() || rigName == "mayor" || strings.HasPrefix(rigName, ".") {
+			continue
+		}
+
+		crewRoot := filepath.Join(townRoot, rigName, "crew")
+		members, err := os.ReadDir(crewRoot)
+		if err != nil {
+			continue // no readable crew directory in this rig
+		}
+
+		for _, member := range members {
+			if !member.IsDir() || strings.HasPrefix(member.Name(), ".") {
+				continue
+			}
+			found = append(found, crewDir{
+				path:     filepath.Join(crewRoot, member.Name()),
+				rigName:  rigName,
+				crewName: member.Name(),
+			})
+		}
+	}
+
+	return found
+}
diff --git a/internal/doctor/lifecycle_check.go b/internal/doctor/lifecycle_check.go
new file mode 100644
index 00000000..e3de64c1
--- /dev/null
+++ b/internal/doctor/lifecycle_check.go
@@ -0,0 +1,302 @@
+package doctor
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+)
+
+// LifecycleHygieneCheck detects and cleans up stale lifecycle state.
+// Such state gets left behind when:
+//   - lifecycle messages were not deleted after they were processed
+//   - an agent's state.json still carries a requesting_* flag
+//   - a session was killed by hand without its state being cleared
+type LifecycleHygieneCheck struct {
+	FixableCheck
+	staleMessages   []staleMessage // collected by Run, deleted by Fix
+	stuckStateFiles []stuckState   // collected by Run, cleared by Fix
+}
+
+// staleMessage is a lifecycle message found in the deacon inbox.
+type staleMessage struct {
+	ID      string
+	Subject string
+	From    string
+}
+
+// stuckState is one requesting_* flag that is set to true in a state file.
+type stuckState struct {
+	stateFile string // path of the state.json holding the flag
+	identity  string // agent identity the state file belongs to
+	flag      string // the requesting_* key
+}
+
+// NewLifecycleHygieneCheck creates a new lifecycle hygiene check.
+func NewLifecycleHygieneCheck() *LifecycleHygieneCheck {
+	base := BaseCheck{
+		CheckName:        "lifecycle-hygiene",
+		CheckDescription: "Check for stale lifecycle messages and stuck state flags",
+	}
+	return &LifecycleHygieneCheck{FixableCheck: FixableCheck{BaseCheck: base}}
+}
+
+// Run checks for stale lifecycle state.
+//
+// It resets the cached findings, inspects the deacon inbox and the agent state
+// files, and returns StatusOK when neither reports anything. Otherwise it
+// returns StatusWarning with one detail line per kind of finding.
+func (c *LifecycleHygieneCheck) Run(ctx *CheckContext) *CheckResult {
+	c.staleMessages, c.stuckStateFiles = nil, nil
+
+	staleCount := c.checkDeaconInbox(ctx)
+	stuckCount := c.checkStateFiles(ctx)
+
+	total := staleCount + stuckCount
+	if total == 0 {
+		return &CheckResult{
+			Name:    c.Name(),
+			Status:  StatusOK,
+			Message: "No stale lifecycle state found",
+		}
+	}
+
+	var details []string
+	if staleCount > 0 {
+		details = append(details, fmt.Sprintf("%d stale lifecycle message(s) in deacon inbox", staleCount))
+	}
+	if stuckCount > 0 {
+		details = append(details, fmt.Sprintf("%d agent(s) with stuck requesting_* flags", stuckCount))
+	}
+
+	return &CheckResult{
+		Name:    c.Name(),
+		Status:  StatusWarning,
+		Message: fmt.Sprintf("Found %d lifecycle hygiene issue(s)", total),
+		Details: details,
+		FixHint: "Run 'gt doctor --fix' to clean up",
+	}
+}
+
+// checkDeaconInbox looks for stale lifecycle messages.
+//
+// It runs "gt mail inbox --identity deacon/ --json" from the town root and
+// caches every message whose subject starts with "lifecycle:" (compared
+// case-insensitively) in c.staleMessages, returning how many are cached. The
+// check is best-effort: if the command fails or its output is not the expected
+// JSON array, it reports zero.
+func (c *LifecycleHygieneCheck) checkDeaconInbox(ctx *CheckContext) int {
+	cmd := exec.Command("gt", "mail", "inbox", "--identity", "deacon/", "--json")
+	cmd.Dir = ctx.TownRoot
+
+	output, err := cmd.Output()
+	if err != nil {
+		return 0 // can't check, assume OK
+	}
+
+	// No output, or an empty array with any surrounding whitespace, means
+	// the inbox is empty.
+	if trimmed := strings.TrimSpace(string(output)); trimmed == "" || trimmed == "[]" {
+		return 0
+	}
+
+	var messages []struct {
+		ID      string `json:"id"`
+		From    string `json:"from"`
+		Subject string `json:"subject"`
+	}
+	if err := json.Unmarshal(output, &messages); err != nil {
+		return 0
+	}
+
+	for _, msg := range messages {
+		if !strings.HasPrefix(strings.ToLower(msg.Subject), "lifecycle:") {
+			continue
+		}
+		c.staleMessages = append(c.staleMessages, staleMessage{
+			ID:      msg.ID,
+			Subject: msg.Subject,
+			From:    msg.From,
+		})
+	}
+
+	return len(c.staleMessages)
+}
+
+// checkStateFiles looks for stuck requesting_* flags in state.json files.
+//
+// A flag counts as stuck when its key starts with "requesting_", its value is
+// the boolean true, and isSessionHealthy reports true for the owning identity.
+// Unreadable or unparsable state files are skipped. Each stuck flag is cached
+// in c.stuckStateFiles and the total number cached is returned.
+func (c *LifecycleHygieneCheck) checkStateFiles(ctx *CheckContext) int {
+	for _, sf := range c.findStateFiles(ctx.TownRoot) {
+		data, err := os.ReadFile(sf.path)
+		if err != nil {
+			continue
+		}
+
+		var state map[string]interface{}
+		if err := json.Unmarshal(data, &state); err != nil {
+			continue
+		}
+
+		// Gather the raised flags first so the session is probed at most
+		// once per file and every flag is judged against the same answer.
+		var raised []string
+		for key, val := range state {
+			if !strings.HasPrefix(key, "requesting_") {
+				continue
+			}
+			if on, ok := val.(bool); ok && on {
+				raised = append(raised, key)
+			}
+		}
+		if len(raised) == 0 || !c.isSessionHealthy(sf.identity) {
+			continue
+		}
+
+		for _, flag := range raised {
+			c.stuckStateFiles = append(c.stuckStateFiles, stuckState{
+				stateFile: sf.path,
+				identity:  sf.identity,
+				flag:      flag,
+			})
+		}
+	}
+
+	return len(c.stuckStateFiles)
+}
+
+// stateFileInfo pairs a state.json path with the identity of the agent it
+// belongs to.
+type stateFileInfo struct {
+	path     string
+	identity string
+}
+
+// findStateFiles locates all state.json files for agents.
+//
+// It looks at <townRoot>/mayor/state.json and, for every non-hidden directory
+// under townRoot other than "mayor", at that rig's witness, refinery and crew
+// member state files. Only files that os.Stat can find are returned, each
+// tagged with the identity built from the rig and role names.
+func (c *LifecycleHygieneCheck) findStateFiles(townRoot string) []stateFileInfo {
+	var found []stateFileInfo
+
+	// addIfPresent appends the file when it exists on disk.
+	addIfPresent := func(path, identity string) {
+		if _, err := os.Stat(path); err == nil {
+			found = append(found, stateFileInfo{path: path, identity: identity})
+		}
+	}
+
+	addIfPresent(filepath.Join(townRoot, "mayor", "state.json"), "mayor")
+
+	rigs, err := os.ReadDir(townRoot)
+	if err != nil {
+		return found
+	}
+
+	for _, rig := range rigs {
+		rigName := rig.Name()
+		if !rig.IsDir() || rigName == "mayor" || strings.HasPrefix(rigName, ".") {
+			continue
+		}
+		rigPath := filepath.Join(townRoot, rigName)
+
+		addIfPresent(filepath.Join(rigPath, "witness", "state.json"), rigName+"-witness")
+		addIfPresent(filepath.Join(rigPath, "refinery", "state.json"), rigName+"-refinery")
+
+		crewPath := filepath.Join(rigPath, "crew")
+		members, err := os.ReadDir(crewPath)
+		if err != nil {
+			continue // no readable crew directory in this rig
+		}
+		for _, member := range members {
+			if !member.IsDir() || strings.HasPrefix(member.Name(), ".") {
+				continue
+			}
+			addIfPresent(
+				filepath.Join(crewPath, member.Name(), "state.json"),
+				rigName+"-crew-"+member.Name(),
+			)
+		}
+	}
+
+	return found
+}
+
+// isSessionHealthy checks if the tmux session for this identity exists and is running.
+//
+// The target is handed to tmux with a leading "=" so that only an exact
+// session-name match counts. Without it tmux also accepts a session whose name
+// merely starts with, or fnmatch-matches, the target, which would let a
+// different agent's session make this one look alive. Identities that map to
+// no session name are reported as not healthy.
+func (c *LifecycleHygieneCheck) isSessionHealthy(identity string) bool {
+	sessionName := identityToSessionName(identity)
+	if sessionName == "" {
+		return false
+	}
+
+	// has-session exits zero only when the target session exists.
+	cmd := exec.Command("tmux", "has-session", "-t", "="+sessionName)
+	return cmd.Run() == nil
+}
+
+// identityToSessionName converts an identity to its tmux session name.
+//
+// "mayor" maps to "gt-mayor". Identities ending in "-witness" or "-refinery",
+// or containing "-crew-", map to "gt-" followed by the identity. Anything else
+// yields "".
+func identityToSessionName(identity string) string {
+	if identity == "mayor" {
+		return "gt-mayor"
+	}
+
+	isRigAgent := strings.HasSuffix(identity, "-witness") ||
+		strings.HasSuffix(identity, "-refinery") ||
+		strings.Contains(identity, "-crew-")
+	if !isRigAgent {
+		return ""
+	}
+	return "gt-" + identity
+}
+
+// Fix cleans up stale lifecycle state.
+//
+// It deletes each cached stale message with "gt mail delete <id>" run from the
+// town root, then clears each cached stuck flag. Every item is attempted; the
+// failures are joined with "; " into a single error, or nil when all succeed.
+func (c *LifecycleHygieneCheck) Fix(ctx *CheckContext) error {
+	// Named failures, not errors, so the standard package name is not shadowed.
+	var failures []string
+
+	for _, msg := range c.staleMessages {
+		cmd := exec.Command("gt", "mail", "delete", msg.ID)
+		cmd.Dir = ctx.TownRoot
+		if err := cmd.Run(); err != nil {
+			failures = append(failures, fmt.Sprintf("failed to delete message %s: %v", msg.ID, err))
+		}
+	}
+
+	for _, stuck := range c.stuckStateFiles {
+		if err := c.clearRequestingFlag(stuck); err != nil {
+			failures = append(failures, fmt.Sprintf("failed to clear %s in %s: %v", stuck.flag, stuck.identity, err))
+		}
+	}
+
+	if len(failures) == 0 {
+		return nil
+	}
+	return fmt.Errorf("%s", strings.Join(failures, "; "))
+}
+
+// clearRequestingFlag removes the stuck requesting_* flag from a state file.
+//
+// The file is decoded into raw JSON values rather than interface{} so that the
+// fields this function does not touch keep their exact values; decoding into
+// interface{} would turn every number into a float64 and corrupt large
+// integers on the way back out. Both stuck.flag and the "requesting_time" key
+// are removed, and the result is written back over stuck.stateFile.
+//
+// It returns any read, parse, encode or write error unchanged.
+func (c *LifecycleHygieneCheck) clearRequestingFlag(stuck stuckState) error {
+	data, err := os.ReadFile(stuck.stateFile)
+	if err != nil {
+		return err
+	}
+
+	var state map[string]json.RawMessage
+	if err := json.Unmarshal(data, &state); err != nil {
+		return err
+	}
+
+	// Remove the requesting flag and any associated timestamp.
+	delete(state, stuck.flag)
+	delete(state, "requesting_time")
+
+	newData, err := json.MarshalIndent(state, "", " ")
+	if err != nil {
+		return err
+	}
+
+	return os.WriteFile(stuck.stateFile, newData, 0644)
+}