bd-151: Implement bd daemons health subcommand

This commit is contained in:
Steve Yegge
2025-10-26 18:21:55 -07:00
parent 1c7de68bfc
commit 0a283d5148
2 changed files with 130 additions and 1 deletions

View File

@@ -56,7 +56,7 @@
{"id":"bd-149","title":"Add auto-cleanup of stale sockets and dead processes","description":"When discovering daemons, automatically detect and clean up stale socket files (where process is dead) and orphaned PID files. Should be safe and only remove confirmed-dead processes.","status":"closed","priority":1,"issue_type":"task","created_at":"2025-10-26T16:54:00.246629-07:00","updated_at":"2025-10-26T18:17:18.560526-07:00","closed_at":"2025-10-26T18:17:18.560526-07:00","dependencies":[{"issue_id":"bd-149","depends_on_id":"bd-145","type":"parent-child","created_at":"2025-10-26T16:54:00.247788-07:00","created_by":"daemon"}]}
{"id":"bd-15","title":"Phase 4: Gradual Cutover \u0026 Production Rollout","description":"Replace SQLite implementation with Beads library in production and remove legacy code.\n\n**Goal:** Complete transition to Beads library, deprecate and remove custom SQLite implementation.\n\n**Key Tasks:**\n1. Run VC executor with Beads library in CI\n2. Dogfood: Use Beads library for VC's own development\n3. Monitor for regressions and performance issues\n4. Flip feature flag: VC_USE_BEADS_LIBRARY=true by default\n5. Monitor production logs for errors\n6. Collect user feedback\n7. Add deprecation notice to CLAUDE.md\n8. Provide migration guide for users\n9. Remove legacy code: internal/storage/sqlite/sqlite.go (~1500 lines)\n10. Remove migration framework: internal/storage/migrations/\n11. Remove manual transaction management code\n12. Update all documentation\n\n**Acceptance Criteria:**\n- Beads library enabled by default in production\n- Zero production incidents related to migration\n- Performance meets or exceeds SQLite implementation\n- All tests passing with Beads library\n- Legacy SQLite code removed\n- Documentation updated\n- Celebration documented 🎉\n\n**Rollout Strategy:**\n1. Week 1: Enable for CI/testing environments\n2. Week 2: Dogfood on VC development\n3. Week 3: Enable for 50% of production (canary)\n4. Week 4: Enable for 100% of production\n5. Week 5: Remove legacy code\n\n**Monitoring:**\n- Track error rates before/after cutover\n- Monitor database query performance\n- Track issue creation/update latency\n- Monitor executor claim performance\n\n**Rollback Plan:**\n- Keep VC_FORCE_SQLITE=true escape hatch for 2 weeks post-cutover\n- Keep legacy code for 1 sprint after cutover\n- Document rollback procedure\n\n**Success Metrics:**\n- Zero data loss\n- No performance regression (\u003c 5% latency increase acceptable)\n- Reduced maintenance burden (code LOC reduction)\n- Positive developer feedback\n\n**Dependencies:**\n- Blocked by Phase 3 (need migration tooling)\n\n**Estimated Effort:** 1 sprint","status":"closed","priority":2,"issue_type":"task","created_at":"2025-10-22T14:05:07.755107-07:00","updated_at":"2025-10-25T23:15:33.474948-07:00","closed_at":"2025-10-22T21:37:48.748919-07:00","dependencies":[{"issue_id":"bd-15","depends_on_id":"bd-11","type":"parent-child","created_at":"2025-10-24T13:17:40.324637-07:00","created_by":"renumber"},{"issue_id":"bd-15","depends_on_id":"bd-14","type":"blocks","created_at":"2025-10-24T13:17:40.324851-07:00","created_by":"renumber"}]}
{"id":"bd-150","title":"Update AGENTS.md and README.md with \"bd daemons\" documentation","description":"Document the new \"bd daemons\" command and all subcommands in AGENTS.md and README.md. Include examples and troubleshooting guidance.","status":"open","priority":2,"issue_type":"task","created_at":"2025-10-26T16:54:00.254006-07:00","updated_at":"2025-10-26T16:54:00.254006-07:00","dependencies":[{"issue_id":"bd-150","depends_on_id":"bd-145","type":"parent-child","created_at":"2025-10-26T16:54:00.254862-07:00","created_by":"daemon"}]}
{"id":"bd-151","title":"Implement \"bd daemons health\" subcommand","description":"Add health check command that pings each daemon and reports responsiveness. Should detect and report stale sockets, version mismatches, unresponsive daemons.","status":"open","priority":1,"issue_type":"task","created_at":"2025-10-26T16:54:00.255444-07:00","updated_at":"2025-10-26T16:54:00.255444-07:00","dependencies":[{"issue_id":"bd-151","depends_on_id":"bd-145","type":"parent-child","created_at":"2025-10-26T17:47:47.949848-07:00","created_by":"stevey"}]}
{"id":"bd-151","title":"Implement \"bd daemons health\" subcommand","description":"Add health check command that pings each daemon and reports responsiveness. Should detect and report stale sockets, version mismatches, unresponsive daemons.","status":"in_progress","priority":1,"issue_type":"task","created_at":"2025-10-26T16:54:00.255444-07:00","updated_at":"2025-10-26T18:18:46.312334-07:00","dependencies":[{"issue_id":"bd-151","depends_on_id":"bd-145","type":"parent-child","created_at":"2025-10-26T17:47:47.949848-07:00","created_by":"stevey"}]}
{"id":"bd-152","title":"Implement \"bd daemons logs\" subcommand","description":"Add command to view daemon logs for a specific workspace. Requires daemon logging to file (may need separate issue for log infrastructure).","status":"open","priority":2,"issue_type":"task","created_at":"2025-10-26T16:54:00.256037-07:00","updated_at":"2025-10-26T16:54:00.256037-07:00","dependencies":[{"issue_id":"bd-152","depends_on_id":"bd-145","type":"parent-child","created_at":"2025-10-26T16:54:00.256797-07:00","created_by":"daemon"}]}
{"id":"bd-153","title":"Implement \"bd daemons killall\" subcommand","description":"Add emergency command to stop all running bd daemons. Should discover all daemons and stop them gracefully (with timeout fallback to SIGKILL).","status":"open","priority":1,"issue_type":"task","created_at":"2025-10-26T16:54:00.258822-07:00","updated_at":"2025-10-26T16:54:00.258822-07:00","dependencies":[{"issue_id":"bd-153","depends_on_id":"bd-145","type":"parent-child","created_at":"2025-10-26T16:54:00.259421-07:00","created_by":"daemon"}]}
{"id":"bd-154","title":"Implement \"bd daemons stop\" and \"bd daemons restart\" subcommands","description":"Add commands to stop and restart individual daemons by path or PID. Should send graceful shutdown signal via socket, with fallback to SIGTERM.","status":"open","priority":1,"issue_type":"task","created_at":"2025-10-26T16:54:00.259875-07:00","updated_at":"2025-10-26T16:54:00.259875-07:00","dependencies":[{"issue_id":"bd-154","depends_on_id":"bd-145","type":"parent-child","created_at":"2025-10-26T16:54:00.260433-07:00","created_by":"daemon"}]}

View File

@@ -124,14 +124,143 @@ func formatDaemonRelativeTime(t time.Time) string {
return fmt.Sprintf("%.1fd ago", d.Hours()/24)
}
var daemonsHealthCmd = &cobra.Command{
Use: "health",
Short: "Check health of all bd daemons",
Long: `Check health of all running bd daemons and report any issues including
stale sockets, version mismatches, and unresponsive daemons.`,
Run: func(cmd *cobra.Command, args []string) {
searchRoots, _ := cmd.Flags().GetStringSlice("search")
jsonOutput, _ := cmd.Flags().GetBool("json")
// Discover daemons
daemons, err := daemon.DiscoverDaemons(searchRoots)
if err != nil {
fmt.Fprintf(os.Stderr, "Error discovering daemons: %v\n", err)
os.Exit(1)
}
type healthReport struct {
Workspace string `json:"workspace"`
SocketPath string `json:"socket_path"`
PID int `json:"pid,omitempty"`
Version string `json:"version,omitempty"`
Status string `json:"status"`
Issue string `json:"issue,omitempty"`
VersionMismatch bool `json:"version_mismatch,omitempty"`
}
var reports []healthReport
healthyCount := 0
staleCount := 0
mismatchCount := 0
unresponsiveCount := 0
currentVersion := Version
for _, d := range daemons {
report := healthReport{
Workspace: d.WorkspacePath,
SocketPath: d.SocketPath,
PID: d.PID,
Version: d.Version,
}
if !d.Alive {
report.Status = "stale"
report.Issue = d.Error
staleCount++
} else if d.Version != currentVersion {
report.Status = "version_mismatch"
report.Issue = fmt.Sprintf("daemon version %s != client version %s", d.Version, currentVersion)
report.VersionMismatch = true
mismatchCount++
} else {
report.Status = "healthy"
healthyCount++
}
reports = append(reports, report)
}
if jsonOutput {
output := map[string]interface{}{
"total": len(reports),
"healthy": healthyCount,
"stale": staleCount,
"mismatched": mismatchCount,
"unresponsive": unresponsiveCount,
"daemons": reports,
}
data, _ := json.MarshalIndent(output, "", " ")
fmt.Println(string(data))
return
}
// Human-readable output
if len(reports) == 0 {
fmt.Println("No daemons found")
return
}
fmt.Printf("Health Check Summary:\n")
fmt.Printf(" Total: %d\n", len(reports))
fmt.Printf(" Healthy: %d\n", healthyCount)
fmt.Printf(" Stale: %d\n", staleCount)
fmt.Printf(" Mismatched: %d\n", mismatchCount)
fmt.Printf(" Unresponsive: %d\n\n", unresponsiveCount)
w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
fmt.Fprintln(w, "WORKSPACE\tPID\tVERSION\tSTATUS\tISSUE")
for _, r := range reports {
workspace := r.Workspace
if workspace == "" {
workspace = "(unknown)"
}
pidStr := "-"
if r.PID != 0 {
pidStr = fmt.Sprintf("%d", r.PID)
}
version := r.Version
if version == "" {
version = "-"
}
status := r.Status
issue := r.Issue
if issue == "" {
issue = "-"
}
fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\n",
workspace, pidStr, version, status, issue)
}
w.Flush()
// Exit with error if there are any issues
if staleCount > 0 || mismatchCount > 0 || unresponsiveCount > 0 {
os.Exit(1)
}
},
}
func init() {
rootCmd.AddCommand(daemonsCmd)
// Add subcommands
daemonsCmd.AddCommand(daemonsListCmd)
daemonsCmd.AddCommand(daemonsHealthCmd)
// Flags for list command
daemonsListCmd.Flags().StringSlice("search", nil, "Directories to search for daemons (default: home, /tmp, cwd)")
daemonsListCmd.Flags().Bool("json", false, "Output in JSON format")
daemonsListCmd.Flags().Bool("no-cleanup", false, "Skip auto-cleanup of stale sockets")
// Flags for health command
daemonsHealthCmd.Flags().StringSlice("search", nil, "Directories to search for daemons (default: home, /tmp, cwd)")
daemonsHealthCmd.Flags().Bool("json", false, "Output in JSON format")
}