Add crew-state and lifecycle-hygiene doctor checks
New checks: (1) crew-state: validates crew worker state.json files for completeness; can regenerate missing/invalid state files with --fix. (2) lifecycle-hygiene: detects stale lifecycle state that can wedge the deacon, namely stale lifecycle messages in the deacon inbox and stuck requesting_* flags in state.json when the session is healthy; can clean up with --fix (external intervention when the deacon is stuck). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -76,6 +76,12 @@ func runDoctor(cmd *cobra.Command, args []string) error {
|
||||
d.Register(doctor.NewRuntimeGitignoreCheck())
|
||||
d.Register(doctor.NewLegacyGastownCheck())
|
||||
|
||||
// Crew workspace checks
|
||||
d.Register(doctor.NewCrewStateCheck())
|
||||
|
||||
// Lifecycle hygiene checks
|
||||
d.Register(doctor.NewLifecycleHygieneCheck())
|
||||
|
||||
// Run checks
|
||||
var report *doctor.Report
|
||||
if doctorFix {
|
||||
|
||||
215
internal/doctor/crew_check.go
Normal file
215
internal/doctor/crew_check.go
Normal file
@@ -0,0 +1,215 @@
|
||||
package doctor
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// CrewStateCheck validates crew worker state.json files for completeness.
|
||||
// Empty or incomplete state.json files cause "can't find pane/session" errors.
|
||||
type CrewStateCheck struct {
|
||||
FixableCheck
|
||||
invalidCrews []invalidCrew // Cached during Run for use in Fix
|
||||
}
|
||||
|
||||
// invalidCrew records one crew workspace whose state.json failed validation.
type invalidCrew struct {
	// path is the crew workspace directory.
	path string
	// stateFile is the full path of the workspace's state.json.
	stateFile string
	// rigName is the name of the rig directory that contains the crew.
	rigName string
	// crewName is the name of the crew workspace directory.
	crewName string
	// issue is a human-readable description of what is wrong.
	issue string
}
|
||||
|
||||
// NewCrewStateCheck creates a new crew state check.
|
||||
func NewCrewStateCheck() *CrewStateCheck {
|
||||
return &CrewStateCheck{
|
||||
FixableCheck: FixableCheck{
|
||||
BaseCheck: BaseCheck{
|
||||
CheckName: "crew-state",
|
||||
CheckDescription: "Validate crew worker state.json files",
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Run checks all crew state.json files for completeness.
|
||||
func (c *CrewStateCheck) Run(ctx *CheckContext) *CheckResult {
|
||||
c.invalidCrews = nil
|
||||
|
||||
crewDirs := c.findAllCrewDirs(ctx.TownRoot)
|
||||
if len(crewDirs) == 0 {
|
||||
return &CheckResult{
|
||||
Name: c.Name(),
|
||||
Status: StatusOK,
|
||||
Message: "No crew workspaces found",
|
||||
}
|
||||
}
|
||||
|
||||
var validCount int
|
||||
var details []string
|
||||
|
||||
for _, cd := range crewDirs {
|
||||
stateFile := filepath.Join(cd.path, "state.json")
|
||||
|
||||
// Check if state.json exists
|
||||
data, err := os.ReadFile(stateFile)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
// Missing state file is OK - code will use defaults
|
||||
validCount++
|
||||
continue
|
||||
}
|
||||
// Other errors are problems
|
||||
issue := fmt.Sprintf("cannot read state.json: %v", err)
|
||||
c.invalidCrews = append(c.invalidCrews, invalidCrew{
|
||||
path: cd.path,
|
||||
stateFile: stateFile,
|
||||
rigName: cd.rigName,
|
||||
crewName: cd.crewName,
|
||||
issue: issue,
|
||||
})
|
||||
details = append(details, fmt.Sprintf("%s/%s: %s", cd.rigName, cd.crewName, issue))
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse state.json
|
||||
var state struct {
|
||||
Name string `json:"name"`
|
||||
Rig string `json:"rig"`
|
||||
ClonePath string `json:"clone_path"`
|
||||
}
|
||||
if err := json.Unmarshal(data, &state); err != nil {
|
||||
issue := "invalid JSON in state.json"
|
||||
c.invalidCrews = append(c.invalidCrews, invalidCrew{
|
||||
path: cd.path,
|
||||
stateFile: stateFile,
|
||||
rigName: cd.rigName,
|
||||
crewName: cd.crewName,
|
||||
issue: issue,
|
||||
})
|
||||
details = append(details, fmt.Sprintf("%s/%s: %s", cd.rigName, cd.crewName, issue))
|
||||
continue
|
||||
}
|
||||
|
||||
// Check for empty/incomplete state
|
||||
var issues []string
|
||||
if state.Name == "" {
|
||||
issues = append(issues, "missing name")
|
||||
}
|
||||
if state.Rig == "" {
|
||||
issues = append(issues, "missing rig")
|
||||
}
|
||||
if state.ClonePath == "" {
|
||||
issues = append(issues, "missing clone_path")
|
||||
}
|
||||
|
||||
if len(issues) > 0 {
|
||||
issue := strings.Join(issues, ", ")
|
||||
c.invalidCrews = append(c.invalidCrews, invalidCrew{
|
||||
path: cd.path,
|
||||
stateFile: stateFile,
|
||||
rigName: cd.rigName,
|
||||
crewName: cd.crewName,
|
||||
issue: issue,
|
||||
})
|
||||
details = append(details, fmt.Sprintf("%s/%s: %s", cd.rigName, cd.crewName, issue))
|
||||
} else {
|
||||
validCount++
|
||||
}
|
||||
}
|
||||
|
||||
if len(c.invalidCrews) == 0 {
|
||||
return &CheckResult{
|
||||
Name: c.Name(),
|
||||
Status: StatusOK,
|
||||
Message: fmt.Sprintf("All %d crew state files valid", validCount),
|
||||
}
|
||||
}
|
||||
|
||||
return &CheckResult{
|
||||
Name: c.Name(),
|
||||
Status: StatusWarning,
|
||||
Message: fmt.Sprintf("%d crew workspace(s) with invalid state.json", len(c.invalidCrews)),
|
||||
Details: details,
|
||||
FixHint: "Run 'gt doctor --fix' to regenerate state files",
|
||||
}
|
||||
}
|
||||
|
||||
// Fix regenerates invalid state.json files with correct values.
|
||||
func (c *CrewStateCheck) Fix(ctx *CheckContext) error {
|
||||
if len(c.invalidCrews) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var lastErr error
|
||||
for _, ic := range c.invalidCrews {
|
||||
state := map[string]interface{}{
|
||||
"name": ic.crewName,
|
||||
"rig": ic.rigName,
|
||||
"clone_path": ic.path,
|
||||
"branch": "main",
|
||||
"created_at": time.Now().Format(time.RFC3339),
|
||||
"updated_at": time.Now().Format(time.RFC3339),
|
||||
}
|
||||
|
||||
data, err := json.MarshalIndent(state, "", " ")
|
||||
if err != nil {
|
||||
lastErr = fmt.Errorf("%s/%s: %w", ic.rigName, ic.crewName, err)
|
||||
continue
|
||||
}
|
||||
|
||||
if err := os.WriteFile(ic.stateFile, data, 0644); err != nil {
|
||||
lastErr = fmt.Errorf("%s/%s: %w", ic.rigName, ic.crewName, err)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
return lastErr
|
||||
}
|
||||
|
||||
// crewDir identifies one crew workspace directory discovered on disk.
type crewDir struct {
	// path is the full path of the crew workspace directory.
	path string
	// rigName is the name of the rig directory that contains the workspace.
	rigName string
	// crewName is the name of the workspace directory itself.
	crewName string
}
|
||||
|
||||
// findAllCrewDirs finds all crew directories in the workspace.
|
||||
func (c *CrewStateCheck) findAllCrewDirs(townRoot string) []crewDir {
|
||||
var dirs []crewDir
|
||||
|
||||
entries, err := os.ReadDir(townRoot)
|
||||
if err != nil {
|
||||
return dirs
|
||||
}
|
||||
|
||||
for _, entry := range entries {
|
||||
if !entry.IsDir() || strings.HasPrefix(entry.Name(), ".") || entry.Name() == "mayor" {
|
||||
continue
|
||||
}
|
||||
|
||||
rigName := entry.Name()
|
||||
crewPath := filepath.Join(townRoot, rigName, "crew")
|
||||
|
||||
crewEntries, err := os.ReadDir(crewPath)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
for _, crew := range crewEntries {
|
||||
if !crew.IsDir() || strings.HasPrefix(crew.Name(), ".") {
|
||||
continue
|
||||
}
|
||||
dirs = append(dirs, crewDir{
|
||||
path: filepath.Join(crewPath, crew.Name()),
|
||||
rigName: rigName,
|
||||
crewName: crew.Name(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return dirs
|
||||
}
|
||||
302
internal/doctor/lifecycle_check.go
Normal file
302
internal/doctor/lifecycle_check.go
Normal file
@@ -0,0 +1,302 @@
|
||||
package doctor
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// LifecycleHygieneCheck detects and cleans up stale lifecycle state.
|
||||
// This can happen when:
|
||||
// - Lifecycle messages weren't properly deleted after processing
|
||||
// - Agent state.json has stuck requesting_* flags
|
||||
// - Session was manually killed without clearing state
|
||||
type LifecycleHygieneCheck struct {
|
||||
FixableCheck
|
||||
staleMessages []staleMessage
|
||||
stuckStateFiles []stuckState
|
||||
}
|
||||
|
||||
// staleMessage describes one lifecycle message found in the deacon inbox.
type staleMessage struct {
	// ID is the message identifier passed to "gt mail delete".
	ID string
	// Subject is the message subject line.
	Subject string
	// From is the sender as reported by the inbox listing.
	From string
}
|
||||
|
||||
// stuckState describes one requesting_* flag that is still set in an agent's
// state.json.
type stuckState struct {
	// stateFile is the path of the state.json that holds the flag.
	stateFile string
	// identity names the agent that owns the state file.
	identity string
	// flag is the JSON key of the stuck flag.
	flag string
}
|
||||
|
||||
// NewLifecycleHygieneCheck creates a new lifecycle hygiene check.
|
||||
func NewLifecycleHygieneCheck() *LifecycleHygieneCheck {
|
||||
return &LifecycleHygieneCheck{
|
||||
FixableCheck: FixableCheck{
|
||||
BaseCheck: BaseCheck{
|
||||
CheckName: "lifecycle-hygiene",
|
||||
CheckDescription: "Check for stale lifecycle messages and stuck state flags",
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Run checks for stale lifecycle state.
|
||||
func (c *LifecycleHygieneCheck) Run(ctx *CheckContext) *CheckResult {
|
||||
c.staleMessages = nil
|
||||
c.stuckStateFiles = nil
|
||||
|
||||
var details []string
|
||||
|
||||
// Check for stale lifecycle messages in deacon inbox
|
||||
staleCount := c.checkDeaconInbox(ctx)
|
||||
if staleCount > 0 {
|
||||
details = append(details, fmt.Sprintf("%d stale lifecycle message(s) in deacon inbox", staleCount))
|
||||
}
|
||||
|
||||
// Check for stuck requesting_* flags in state files
|
||||
stuckCount := c.checkStateFiles(ctx)
|
||||
if stuckCount > 0 {
|
||||
details = append(details, fmt.Sprintf("%d agent(s) with stuck requesting_* flags", stuckCount))
|
||||
}
|
||||
|
||||
total := staleCount + stuckCount
|
||||
if total == 0 {
|
||||
return &CheckResult{
|
||||
Name: c.Name(),
|
||||
Status: StatusOK,
|
||||
Message: "No stale lifecycle state found",
|
||||
}
|
||||
}
|
||||
|
||||
return &CheckResult{
|
||||
Name: c.Name(),
|
||||
Status: StatusWarning,
|
||||
Message: fmt.Sprintf("Found %d lifecycle hygiene issue(s)", total),
|
||||
Details: details,
|
||||
FixHint: "Run 'gt doctor --fix' to clean up",
|
||||
}
|
||||
}
|
||||
|
||||
// checkDeaconInbox looks for stale lifecycle messages.
|
||||
func (c *LifecycleHygieneCheck) checkDeaconInbox(ctx *CheckContext) int {
|
||||
// Get deacon inbox via gt mail
|
||||
cmd := exec.Command("gt", "mail", "inbox", "--identity", "deacon/", "--json")
|
||||
cmd.Dir = ctx.TownRoot
|
||||
|
||||
output, err := cmd.Output()
|
||||
if err != nil {
|
||||
return 0 // Can't check, assume OK
|
||||
}
|
||||
|
||||
if len(output) == 0 || string(output) == "[]" || string(output) == "[]\n" {
|
||||
return 0
|
||||
}
|
||||
|
||||
var messages []struct {
|
||||
ID string `json:"id"`
|
||||
From string `json:"from"`
|
||||
Subject string `json:"subject"`
|
||||
}
|
||||
if err := json.Unmarshal(output, &messages); err != nil {
|
||||
return 0
|
||||
}
|
||||
|
||||
// Look for lifecycle messages
|
||||
for _, msg := range messages {
|
||||
if strings.HasPrefix(strings.ToLower(msg.Subject), "lifecycle:") {
|
||||
c.staleMessages = append(c.staleMessages, staleMessage{
|
||||
ID: msg.ID,
|
||||
Subject: msg.Subject,
|
||||
From: msg.From,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return len(c.staleMessages)
|
||||
}
|
||||
|
||||
// checkStateFiles looks for stuck requesting_* flags in state.json files.
|
||||
func (c *LifecycleHygieneCheck) checkStateFiles(ctx *CheckContext) int {
|
||||
stateFiles := c.findStateFiles(ctx.TownRoot)
|
||||
|
||||
for _, sf := range stateFiles {
|
||||
data, err := os.ReadFile(sf.path)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
var state map[string]interface{}
|
||||
if err := json.Unmarshal(data, &state); err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check for any requesting_* flags
|
||||
for key, val := range state {
|
||||
if strings.HasPrefix(key, "requesting_") {
|
||||
if boolVal, ok := val.(bool); ok && boolVal {
|
||||
// Found a stuck flag - verify session is actually healthy
|
||||
if c.isSessionHealthy(sf.identity) {
|
||||
c.stuckStateFiles = append(c.stuckStateFiles, stuckState{
|
||||
stateFile: sf.path,
|
||||
identity: sf.identity,
|
||||
flag: key,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return len(c.stuckStateFiles)
|
||||
}
|
||||
|
||||
// stateFileInfo pairs an agent's state.json path with the agent's identity.
type stateFileInfo struct {
	// path is the full path of the state.json file.
	path string
	// identity is the agent identity, e.g. "mayor" or "<rig>-witness".
	identity string
}
|
||||
|
||||
// findStateFiles locates all state.json files for agents.
|
||||
func (c *LifecycleHygieneCheck) findStateFiles(townRoot string) []stateFileInfo {
|
||||
var files []stateFileInfo
|
||||
|
||||
// Mayor state
|
||||
mayorState := filepath.Join(townRoot, "mayor", "state.json")
|
||||
if _, err := os.Stat(mayorState); err == nil {
|
||||
files = append(files, stateFileInfo{path: mayorState, identity: "mayor"})
|
||||
}
|
||||
|
||||
// Scan rigs for witness, refinery, and crew state files
|
||||
entries, err := os.ReadDir(townRoot)
|
||||
if err != nil {
|
||||
return files
|
||||
}
|
||||
|
||||
for _, entry := range entries {
|
||||
if !entry.IsDir() || strings.HasPrefix(entry.Name(), ".") || entry.Name() == "mayor" {
|
||||
continue
|
||||
}
|
||||
|
||||
rigName := entry.Name()
|
||||
rigPath := filepath.Join(townRoot, rigName)
|
||||
|
||||
// Witness state
|
||||
witnessState := filepath.Join(rigPath, "witness", "state.json")
|
||||
if _, err := os.Stat(witnessState); err == nil {
|
||||
files = append(files, stateFileInfo{
|
||||
path: witnessState,
|
||||
identity: rigName + "-witness",
|
||||
})
|
||||
}
|
||||
|
||||
// Refinery state
|
||||
refineryState := filepath.Join(rigPath, "refinery", "state.json")
|
||||
if _, err := os.Stat(refineryState); err == nil {
|
||||
files = append(files, stateFileInfo{
|
||||
path: refineryState,
|
||||
identity: rigName + "-refinery",
|
||||
})
|
||||
}
|
||||
|
||||
// Crew state files
|
||||
crewPath := filepath.Join(rigPath, "crew")
|
||||
crewEntries, err := os.ReadDir(crewPath)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
for _, crew := range crewEntries {
|
||||
if !crew.IsDir() || strings.HasPrefix(crew.Name(), ".") {
|
||||
continue
|
||||
}
|
||||
crewState := filepath.Join(crewPath, crew.Name(), "state.json")
|
||||
if _, err := os.Stat(crewState); err == nil {
|
||||
files = append(files, stateFileInfo{
|
||||
path: crewState,
|
||||
identity: rigName + "-crew-" + crew.Name(),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return files
|
||||
}
|
||||
|
||||
// isSessionHealthy checks if the tmux session for this identity exists and is running.
|
||||
func (c *LifecycleHygieneCheck) isSessionHealthy(identity string) bool {
|
||||
sessionName := identityToSessionName(identity)
|
||||
if sessionName == "" {
|
||||
return false
|
||||
}
|
||||
|
||||
// Check if session exists
|
||||
cmd := exec.Command("tmux", "has-session", "-t", sessionName)
|
||||
return cmd.Run() == nil
|
||||
}
|
||||
|
||||
// identityToSessionName maps an agent identity to its tmux session name.
//
// "mayor" maps to "gt-mayor". Identities ending in "-witness" or "-refinery",
// or containing "-crew-", map to "gt-" followed by the identity. Any other
// identity yields the empty string.
func identityToSessionName(identity string) string {
	if identity == "mayor" {
		return "gt-mayor"
	}

	rigAgent := strings.HasSuffix(identity, "-witness") ||
		strings.HasSuffix(identity, "-refinery") ||
		strings.Contains(identity, "-crew-")
	if !rigAgent {
		return ""
	}
	return "gt-" + identity
}
|
||||
|
||||
// Fix cleans up stale lifecycle state.
|
||||
func (c *LifecycleHygieneCheck) Fix(ctx *CheckContext) error {
|
||||
var errors []string
|
||||
|
||||
// Delete stale lifecycle messages
|
||||
for _, msg := range c.staleMessages {
|
||||
cmd := exec.Command("gt", "mail", "delete", msg.ID)
|
||||
cmd.Dir = ctx.TownRoot
|
||||
if err := cmd.Run(); err != nil {
|
||||
errors = append(errors, fmt.Sprintf("failed to delete message %s: %v", msg.ID, err))
|
||||
}
|
||||
}
|
||||
|
||||
// Clear stuck requesting_* flags
|
||||
for _, stuck := range c.stuckStateFiles {
|
||||
if err := c.clearRequestingFlag(stuck); err != nil {
|
||||
errors = append(errors, fmt.Sprintf("failed to clear %s in %s: %v", stuck.flag, stuck.identity, err))
|
||||
}
|
||||
}
|
||||
|
||||
if len(errors) > 0 {
|
||||
return fmt.Errorf("%s", strings.Join(errors, "; "))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// clearRequestingFlag removes the stuck requesting_* flag from a state file.
|
||||
func (c *LifecycleHygieneCheck) clearRequestingFlag(stuck stuckState) error {
|
||||
data, err := os.ReadFile(stuck.stateFile)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var state map[string]interface{}
|
||||
if err := json.Unmarshal(data, &state); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Remove the requesting flag and any associated timestamp
|
||||
delete(state, stuck.flag)
|
||||
delete(state, "requesting_time")
|
||||
|
||||
newData, err := json.MarshalIndent(state, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return os.WriteFile(stuck.stateFile, newData, 0644)
|
||||
}
|
||||
Reference in New Issue
Block a user