From 0a283d5148f0661e2009f100f92f7fd5eed83383 Mon Sep 17 00:00:00 2001 From: Steve Yegge Date: Sun, 26 Oct 2025 18:21:55 -0700 Subject: [PATCH] bd-151: Implement bd daemons health subcommand --- .beads/beads.jsonl | 2 +- cmd/bd/daemons.go | 129 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+), 1 deletion(-) diff --git a/.beads/beads.jsonl b/.beads/beads.jsonl index 6418bf2b..684cbc15 100644 --- a/.beads/beads.jsonl +++ b/.beads/beads.jsonl @@ -56,7 +56,7 @@ {"id":"bd-149","title":"Add auto-cleanup of stale sockets and dead processes","description":"When discovering daemons, automatically detect and clean up stale socket files (where process is dead) and orphaned PID files. Should be safe and only remove confirmed-dead processes.","status":"closed","priority":1,"issue_type":"task","created_at":"2025-10-26T16:54:00.246629-07:00","updated_at":"2025-10-26T18:17:18.560526-07:00","closed_at":"2025-10-26T18:17:18.560526-07:00","dependencies":[{"issue_id":"bd-149","depends_on_id":"bd-145","type":"parent-child","created_at":"2025-10-26T16:54:00.247788-07:00","created_by":"daemon"}]} {"id":"bd-15","title":"Phase 4: Gradual Cutover \u0026 Production Rollout","description":"Replace SQLite implementation with Beads library in production and remove legacy code.\n\n**Goal:** Complete transition to Beads library, deprecate and remove custom SQLite implementation.\n\n**Key Tasks:**\n1. Run VC executor with Beads library in CI\n2. Dogfood: Use Beads library for VC's own development\n3. Monitor for regressions and performance issues\n4. Flip feature flag: VC_USE_BEADS_LIBRARY=true by default\n5. Monitor production logs for errors\n6. Collect user feedback\n7. Add deprecation notice to CLAUDE.md\n8. Provide migration guide for users\n9. Remove legacy code: internal/storage/sqlite/sqlite.go (~1500 lines)\n10. Remove migration framework: internal/storage/migrations/\n11. Remove manual transaction management code\n12. Update all documentation\n\n**Acceptance Criteria:**\n- Beads library enabled by default in production\n- Zero production incidents related to migration\n- Performance meets or exceeds SQLite implementation\n- All tests passing with Beads library\n- Legacy SQLite code removed\n- Documentation updated\n- Celebration documented 🎉\n\n**Rollout Strategy:**\n1. Week 1: Enable for CI/testing environments\n2. Week 2: Dogfood on VC development\n3. Week 3: Enable for 50% of production (canary)\n4. Week 4: Enable for 100% of production\n5. Week 5: Remove legacy code\n\n**Monitoring:**\n- Track error rates before/after cutover\n- Monitor database query performance\n- Track issue creation/update latency\n- Monitor executor claim performance\n\n**Rollback Plan:**\n- Keep VC_FORCE_SQLITE=true escape hatch for 2 weeks post-cutover\n- Keep legacy code for 1 sprint after cutover\n- Document rollback procedure\n\n**Success Metrics:**\n- Zero data loss\n- No performance regression (\u003c 5% latency increase acceptable)\n- Reduced maintenance burden (code LOC reduction)\n- Positive developer feedback\n\n**Dependencies:**\n- Blocked by Phase 3 (need migration tooling)\n\n**Estimated Effort:** 1 sprint","status":"closed","priority":2,"issue_type":"task","created_at":"2025-10-22T14:05:07.755107-07:00","updated_at":"2025-10-25T23:15:33.474948-07:00","closed_at":"2025-10-22T21:37:48.748919-07:00","dependencies":[{"issue_id":"bd-15","depends_on_id":"bd-11","type":"parent-child","created_at":"2025-10-24T13:17:40.324637-07:00","created_by":"renumber"},{"issue_id":"bd-15","depends_on_id":"bd-14","type":"blocks","created_at":"2025-10-24T13:17:40.324851-07:00","created_by":"renumber"}]} {"id":"bd-150","title":"Update AGENTS.md and README.md with \"bd daemons\" documentation","description":"Document the new \"bd daemons\" command and all subcommands in AGENTS.md and README.md. Include examples and troubleshooting guidance.","status":"open","priority":2,"issue_type":"task","created_at":"2025-10-26T16:54:00.254006-07:00","updated_at":"2025-10-26T16:54:00.254006-07:00","dependencies":[{"issue_id":"bd-150","depends_on_id":"bd-145","type":"parent-child","created_at":"2025-10-26T16:54:00.254862-07:00","created_by":"daemon"}]} -{"id":"bd-151","title":"Implement \"bd daemons health\" subcommand","description":"Add health check command that pings each daemon and reports responsiveness. Should detect and report stale sockets, version mismatches, unresponsive daemons.","status":"open","priority":1,"issue_type":"task","created_at":"2025-10-26T16:54:00.255444-07:00","updated_at":"2025-10-26T16:54:00.255444-07:00","dependencies":[{"issue_id":"bd-151","depends_on_id":"bd-145","type":"parent-child","created_at":"2025-10-26T17:47:47.949848-07:00","created_by":"stevey"}]} +{"id":"bd-151","title":"Implement \"bd daemons health\" subcommand","description":"Add health check command that pings each daemon and reports responsiveness. Should detect and report stale sockets, version mismatches, unresponsive daemons.","status":"in_progress","priority":1,"issue_type":"task","created_at":"2025-10-26T16:54:00.255444-07:00","updated_at":"2025-10-26T18:18:46.312334-07:00","dependencies":[{"issue_id":"bd-151","depends_on_id":"bd-145","type":"parent-child","created_at":"2025-10-26T17:47:47.949848-07:00","created_by":"stevey"}]} {"id":"bd-152","title":"Implement \"bd daemons logs\" subcommand","description":"Add command to view daemon logs for a specific workspace. Requires daemon logging to file (may need separate issue for log infrastructure).","status":"open","priority":2,"issue_type":"task","created_at":"2025-10-26T16:54:00.256037-07:00","updated_at":"2025-10-26T16:54:00.256037-07:00","dependencies":[{"issue_id":"bd-152","depends_on_id":"bd-145","type":"parent-child","created_at":"2025-10-26T16:54:00.256797-07:00","created_by":"daemon"}]} {"id":"bd-153","title":"Implement \"bd daemons killall\" subcommand","description":"Add emergency command to stop all running bd daemons. Should discover all daemons and stop them gracefully (with timeout fallback to SIGKILL).","status":"open","priority":1,"issue_type":"task","created_at":"2025-10-26T16:54:00.258822-07:00","updated_at":"2025-10-26T16:54:00.258822-07:00","dependencies":[{"issue_id":"bd-153","depends_on_id":"bd-145","type":"parent-child","created_at":"2025-10-26T16:54:00.259421-07:00","created_by":"daemon"}]} {"id":"bd-154","title":"Implement \"bd daemons stop\" and \"bd daemons restart\" subcommands","description":"Add commands to stop and restart individual daemons by path or PID. Should send graceful shutdown signal via socket, with fallback to SIGTERM.","status":"open","priority":1,"issue_type":"task","created_at":"2025-10-26T16:54:00.259875-07:00","updated_at":"2025-10-26T16:54:00.259875-07:00","dependencies":[{"issue_id":"bd-154","depends_on_id":"bd-145","type":"parent-child","created_at":"2025-10-26T16:54:00.260433-07:00","created_by":"daemon"}]} diff --git a/cmd/bd/daemons.go b/cmd/bd/daemons.go index 04923fbc..24957a77 100644 --- a/cmd/bd/daemons.go +++ b/cmd/bd/daemons.go @@ -124,14 +124,143 @@ func formatDaemonRelativeTime(t time.Time) string { return fmt.Sprintf("%.1fd ago", d.Hours()/24) } +var daemonsHealthCmd = &cobra.Command{ + Use: "health", + Short: "Check health of all bd daemons", + Long: `Check health of all running bd daemons and report any issues including +stale sockets, version mismatches, and unresponsive daemons.`, + Run: func(cmd *cobra.Command, args []string) { + searchRoots, _ := cmd.Flags().GetStringSlice("search") + jsonOutput, _ := cmd.Flags().GetBool("json") + + // Discover daemons + daemons, err := daemon.DiscoverDaemons(searchRoots) + if err != nil { + fmt.Fprintf(os.Stderr, "Error discovering daemons: %v\n", err) + os.Exit(1) + } + + type healthReport struct { + Workspace string `json:"workspace"` + SocketPath string `json:"socket_path"` + PID int `json:"pid,omitempty"` + Version string `json:"version,omitempty"` + Status string `json:"status"` + Issue string `json:"issue,omitempty"` + VersionMismatch bool `json:"version_mismatch,omitempty"` + } + + var reports []healthReport + healthyCount := 0 + staleCount := 0 + mismatchCount := 0 + unresponsiveCount := 0 + + currentVersion := Version + + for _, d := range daemons { + report := healthReport{ + Workspace: d.WorkspacePath, + SocketPath: d.SocketPath, + PID: d.PID, + Version: d.Version, + } + + if !d.Alive { + report.Status = "stale" + report.Issue = d.Error + staleCount++ + } else if d.Version != currentVersion { + report.Status = "version_mismatch" + report.Issue = fmt.Sprintf("daemon version %s != client version %s", d.Version, currentVersion) + report.VersionMismatch = true + mismatchCount++ + } else { + report.Status = "healthy" + healthyCount++ + } + + reports = append(reports, report) + } + + if jsonOutput { + output := map[string]interface{}{ + "total": len(reports), + "healthy": healthyCount, + "stale": staleCount, + "mismatched": mismatchCount, + "unresponsive": unresponsiveCount, + "daemons": reports, + } + data, _ := json.MarshalIndent(output, "", " ") + fmt.Println(string(data)) + return + } + + // Human-readable output + if len(reports) == 0 { + fmt.Println("No daemons found") + return + } + + fmt.Printf("Health Check Summary:\n") + fmt.Printf(" Total: %d\n", len(reports)) + fmt.Printf(" Healthy: %d\n", healthyCount) + fmt.Printf(" Stale: %d\n", staleCount) + fmt.Printf(" Mismatched: %d\n", mismatchCount) + fmt.Printf(" Unresponsive: %d\n\n", unresponsiveCount) + + w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0) + fmt.Fprintln(w, "WORKSPACE\tPID\tVERSION\tSTATUS\tISSUE") + + for _, r := range reports { + workspace := r.Workspace + if workspace == "" { + workspace = "(unknown)" + } + + pidStr := "-" + if r.PID != 0 { + pidStr = fmt.Sprintf("%d", r.PID) + } + + version := r.Version + if version == "" { + version = "-" + } + + status := r.Status + issue := r.Issue + if issue == "" { + issue = "-" + } + + fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\n", + workspace, pidStr, version, status, issue) + } + + w.Flush() + + // Exit with error if there are any issues + if staleCount > 0 || mismatchCount > 0 || unresponsiveCount > 0 { + os.Exit(1) + } + }, +} + func init() { rootCmd.AddCommand(daemonsCmd) // Add subcommands daemonsCmd.AddCommand(daemonsListCmd) + daemonsCmd.AddCommand(daemonsHealthCmd) // Flags for list command daemonsListCmd.Flags().StringSlice("search", nil, "Directories to search for daemons (default: home, /tmp, cwd)") daemonsListCmd.Flags().Bool("json", false, "Output in JSON format") daemonsListCmd.Flags().Bool("no-cleanup", false, "Skip auto-cleanup of stale sockets") + + // Flags for health command + daemonsHealthCmd.Flags().StringSlice("search", nil, "Directories to search for daemons (default: home, /tmp, cwd)") + daemonsHealthCmd.Flags().Bool("json", false, "Output in JSON format") }