Add daemon health check endpoint (bd-146)
- Add OpHealth RPC operation to protocol - Implement handleHealth() with DB ping and 1s timeout - Returns status (healthy/degraded/unhealthy), uptime, cache metrics - Update TryConnect() to use health check instead of ping - Add 'bd daemon --health' CLI command with JSON output - Track cache hits/misses for metrics - Unhealthy daemon triggers automatic fallback to direct mode - Health check completes in <2 seconds Amp-Thread-ID: https://ampcode.com/threads/T-1a4889f3-77cf-433a-a704-e1c383929f48 Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
1
.beads/bd.sock.startlock
Normal file
1
.beads/bd.sock.startlock
Normal file
@@ -0,0 +1 @@
|
|||||||
|
46949
|
||||||
@@ -50,7 +50,7 @@
|
|||||||
{"id":"bd-143","title":"Bias ready work towards recent issues before oldest-first","description":"Currently 'bd ready' shows oldest issues first (by created_at). This can bury recently discovered work that might be more relevant. Propose a hybrid approach: show issues from the past 1-2 days first (sorted by priority), then fall back to oldest-first for older issues. This keeps fresh discoveries visible while still surfacing old forgotten work.","status":"closed","priority":2,"issue_type":"feature","created_at":"2025-10-18T09:31:15.036495-07:00","updated_at":"2025-10-18T09:57:28.105887-07:00","closed_at":"2025-10-18T09:35:55.084891-07:00"}
|
{"id":"bd-143","title":"Bias ready work towards recent issues before oldest-first","description":"Currently 'bd ready' shows oldest issues first (by created_at). This can bury recently discovered work that might be more relevant. Propose a hybrid approach: show issues from the past 1-2 days first (sorted by priority), then fall back to oldest-first for older issues. This keeps fresh discoveries visible while still surfacing old forgotten work.","status":"closed","priority":2,"issue_type":"feature","created_at":"2025-10-18T09:31:15.036495-07:00","updated_at":"2025-10-18T09:57:28.105887-07:00","closed_at":"2025-10-18T09:35:55.084891-07:00"}
|
||||||
{"id":"bd-144","title":"Fix nil pointer dereference in renumber command","description":"The 'bd renumber' command crashes with a nil pointer dereference at renumber.go:52 because store is nil. The command doesn't properly handle daemon/direct mode initialization like other commands do. Error occurs on both --dry-run and --force modes.","status":"closed","priority":1,"issue_type":"bug","created_at":"2025-10-18T09:54:31.59912-07:00","updated_at":"2025-10-18T09:57:28.106373-07:00","closed_at":"2025-10-18T09:56:49.88701-07:00"}
|
{"id":"bd-144","title":"Fix nil pointer dereference in renumber command","description":"The 'bd renumber' command crashes with a nil pointer dereference at renumber.go:52 because store is nil. The command doesn't properly handle daemon/direct mode initialization like other commands do. Error occurs on both --dry-run and --force modes.","status":"closed","priority":1,"issue_type":"bug","created_at":"2025-10-18T09:54:31.59912-07:00","updated_at":"2025-10-18T09:57:28.106373-07:00","closed_at":"2025-10-18T09:56:49.88701-07:00"}
|
||||||
{"id":"bd-145","title":"Add storage cache eviction policy to daemon","description":"Daemon caches DB connections forever in storageCache map (server.go:29). For users with 50+ repos, this causes memory leaks and file descriptor exhaustion.\n\nNeed LRU cache with:\n- Max size limit (default: 50 repos)\n- TTL-based eviction (default: 30min idle)\n- Periodic cleanup goroutine\n\nLocation: internal/rpc/server.go:29-40","design":"Add StorageCacheEntry struct with lastAccess timestamp.\n\nImplement evictStaleStorage() method that runs every 5 minutes to close connections idle \u003e30min.\n\nAdd max cache size enforcement (LRU eviction when full).\n\nMake limits configurable via env vars:\n- BEADS_DAEMON_MAX_CACHE_SIZE (default: 50)\n- BEADS_DAEMON_CACHE_TTL (default: 30m)","acceptance_criteria":"- Cache evicts entries after 30min idle\n- Cache respects max size limit\n- Cleanup goroutine runs periodically\n- Evicted storage connections are properly closed\n- No resource leaks under sustained load\n- Unit tests for eviction logic","status":"closed","priority":0,"issue_type":"feature","created_at":"2025-10-18T13:05:46.174245-07:00","updated_at":"2025-10-18T13:16:56.921023-07:00","closed_at":"2025-10-18T13:16:56.921023-07:00","dependencies":[{"issue_id":"bd-145","depends_on_id":"bd-155","type":"parent-child","created_at":"2025-10-18T13:07:49.077954-07:00","created_by":"daemon"}]}
|
{"id":"bd-145","title":"Add storage cache eviction policy to daemon","description":"Daemon caches DB connections forever in storageCache map (server.go:29). For users with 50+ repos, this causes memory leaks and file descriptor exhaustion.\n\nNeed LRU cache with:\n- Max size limit (default: 50 repos)\n- TTL-based eviction (default: 30min idle)\n- Periodic cleanup goroutine\n\nLocation: internal/rpc/server.go:29-40","design":"Add StorageCacheEntry struct with lastAccess timestamp.\n\nImplement evictStaleStorage() method that runs every 5 minutes to close connections idle \u003e30min.\n\nAdd max cache size enforcement (LRU eviction when full).\n\nMake limits configurable via env vars:\n- BEADS_DAEMON_MAX_CACHE_SIZE (default: 50)\n- BEADS_DAEMON_CACHE_TTL (default: 30m)","acceptance_criteria":"- Cache evicts entries after 30min idle\n- Cache respects max size limit\n- Cleanup goroutine runs periodically\n- Evicted storage connections are properly closed\n- No resource leaks under sustained load\n- Unit tests for eviction logic","status":"closed","priority":0,"issue_type":"feature","created_at":"2025-10-18T13:05:46.174245-07:00","updated_at":"2025-10-18T13:16:56.921023-07:00","closed_at":"2025-10-18T13:16:56.921023-07:00","dependencies":[{"issue_id":"bd-145","depends_on_id":"bd-155","type":"parent-child","created_at":"2025-10-18T13:07:49.077954-07:00","created_by":"daemon"}]}
|
||||||
{"id":"bd-146","title":"Add daemon health check endpoint and probes","description":"Auto-start only checks socket existence, not daemon responsiveness. Daemon can be running but unresponsive (deadlock, hung DB). Users work in degraded direct mode without knowing why.\n\nNeed health check RPC operation that:\n- Tests DB connectivity (1s timeout)\n- Returns uptime, status, metrics\n- Used by auto-start before connecting\n- Enables monitoring/alerting\n\nLocation: internal/rpc/server.go, cmd/bd/main.go:100-108","design":"Add OpHealth RPC operation to protocol.\n\nhandleHealth() implementation:\n- Quick DB ping with 1s timeout\n- Return status, uptime, version\n- Include basic metrics (connections, cache size)\n\nUpdate TryConnect() to call Health() after socket connection:\n- If health check fails, close connection and return nil\n- Enables transparent failover to direct mode\n\nAdd 'bd daemon --health' CLI command for monitoring.","acceptance_criteria":"- Health check RPC endpoint works\n- Returns structured health status\n- Client uses health check before operations\n- bd daemon --health command exists\n- Unhealthy daemon triggers auto-restart or fallback\n- Health check completes in \u003c2 seconds","status":"open","priority":0,"issue_type":"feature","created_at":"2025-10-18T13:05:58.647592-07:00","updated_at":"2025-10-18T13:05:58.647592-07:00","dependencies":[{"issue_id":"bd-146","depends_on_id":"bd-155","type":"parent-child","created_at":"2025-10-18T13:07:49.093618-07:00","created_by":"daemon"}]}
|
{"id":"bd-146","title":"Add daemon health check endpoint and probes","description":"Auto-start only checks socket existence, not daemon responsiveness. Daemon can be running but unresponsive (deadlock, hung DB). Users work in degraded direct mode without knowing why.\n\nNeed health check RPC operation that:\n- Tests DB connectivity (1s timeout)\n- Returns uptime, status, metrics\n- Used by auto-start before connecting\n- Enables monitoring/alerting\n\nLocation: internal/rpc/server.go, cmd/bd/main.go:100-108","design":"Add OpHealth RPC operation to protocol.\n\nhandleHealth() implementation:\n- Quick DB ping with 1s timeout\n- Return status, uptime, version\n- Include basic metrics (connections, cache size)\n\nUpdate TryConnect() to call Health() after socket connection:\n- If health check fails, close connection and return nil\n- Enables transparent failover to direct mode\n\nAdd 'bd daemon --health' CLI command for monitoring.","acceptance_criteria":"- Health check RPC endpoint works\n- Returns structured health status\n- Client uses health check before operations\n- bd daemon --health command exists\n- Unhealthy daemon triggers auto-restart or fallback\n- Health check completes in \u003c2 seconds","status":"closed","priority":0,"issue_type":"feature","created_at":"2025-10-18T13:05:58.647592-07:00","updated_at":"2025-10-18T13:32:15.106003-07:00","closed_at":"2025-10-18T13:32:15.106003-07:00","dependencies":[{"issue_id":"bd-146","depends_on_id":"bd-155","type":"parent-child","created_at":"2025-10-18T13:07:49.093618-07:00","created_by":"daemon"}]}
|
||||||
{"id":"bd-147","title":"Add stale socket and crash recovery for daemon","description":"When daemon crashes (panic, OOM, signal), socket file remains and blocks new daemon start. Users must manually remove .beads/bd.sock.\n\nProblems:\n- Socket file remains after crash\n- PID file remains (isDaemonRunning false positive)\n- No automatic recovery\n- Users get 'daemon already running' error\n\nLocation: cmd/bd/daemon.go, cmd/bd/main.go:221-311","design":"Improve stale detection in tryAutoStartDaemon():\n\n1. If socket exists, try to connect\n2. If connection fails → stale socket, remove it\n3. Also remove PID file and lock files\n4. Retry daemon start\n\nAdd self-healing to daemon startup:\n- On startup, check for stale PID files\n- If PID in file doesn't exist, remove and continue\n- Use exclusive file lock to prevent races\n\nOptional: Add crash recovery watchdog that restarts daemon on exit.","acceptance_criteria":"- Stale sockets are automatically detected and removed\n- Auto-start recovers from daemon crashes\n- No manual intervention needed for crash recovery\n- PID file management is robust\n- Lock files prevent multiple daemon instances\n- Tests for crash recovery scenarios","status":"open","priority":0,"issue_type":"bug","created_at":"2025-10-18T13:06:10.116917-07:00","updated_at":"2025-10-18T13:06:10.116917-07:00","dependencies":[{"issue_id":"bd-147","depends_on_id":"bd-155","type":"parent-child","created_at":"2025-10-18T13:07:49.108099-07:00","created_by":"daemon"}]}
|
{"id":"bd-147","title":"Add stale socket and crash recovery for daemon","description":"When daemon crashes (panic, OOM, signal), socket file remains and blocks new daemon start. Users must manually remove .beads/bd.sock.\n\nProblems:\n- Socket file remains after crash\n- PID file remains (isDaemonRunning false positive)\n- No automatic recovery\n- Users get 'daemon already running' error\n\nLocation: cmd/bd/daemon.go, cmd/bd/main.go:221-311","design":"Improve stale detection in tryAutoStartDaemon():\n\n1. If socket exists, try to connect\n2. If connection fails → stale socket, remove it\n3. Also remove PID file and lock files\n4. Retry daemon start\n\nAdd self-healing to daemon startup:\n- On startup, check for stale PID files\n- If PID in file doesn't exist, remove and continue\n- Use exclusive file lock to prevent races\n\nOptional: Add crash recovery watchdog that restarts daemon on exit.","acceptance_criteria":"- Stale sockets are automatically detected and removed\n- Auto-start recovers from daemon crashes\n- No manual intervention needed for crash recovery\n- PID file management is robust\n- Lock files prevent multiple daemon instances\n- Tests for crash recovery scenarios","status":"open","priority":0,"issue_type":"bug","created_at":"2025-10-18T13:06:10.116917-07:00","updated_at":"2025-10-18T13:06:10.116917-07:00","dependencies":[{"issue_id":"bd-147","depends_on_id":"bd-155","type":"parent-child","created_at":"2025-10-18T13:07:49.108099-07:00","created_by":"daemon"}]}
|
||||||
{"id":"bd-148","title":"Add lifecycle management for beads-mcp processes","description":"MCP server processes accumulate without cleanup. Each tool invocation spawns a new Python process that lingers after Claude disconnects.\n\nObserved: 6+ beads-mcp processes running simultaneously.\n\nProblems:\n- No parent-child relationship tracking\n- No cleanup on MCP client disconnect\n- Processes leak over days of use\n- Could accumulate hundreds of processes\n\nLocation: integrations/beads-mcp/src/beads_mcp/server.py","design":"Add proper cleanup handlers to MCP server:\n\n1. Register atexit handler to close daemon connections\n2. Handle SIGTERM/SIGINT for graceful shutdown\n3. Close daemon client in cleanup()\n4. Remove any temp files\n\nOptional improvements:\n- Track active connections to daemon\n- Implement connection pooling\n- Add process timeout/TTL\n- Log lifecycle events for debugging\n\nExample:\nimport atexit\nimport signal\n\ndef cleanup():\n # Close daemon connections\n # Remove temp files\n pass\n\natexit.register(cleanup)\nsignal.signal(signal.SIGTERM, lambda s, f: cleanup())","acceptance_criteria":"- MCP processes clean up on exit\n- Daemon connections are properly closed\n- No process leaks after repeated use\n- Signal handlers work correctly\n- Cleanup runs on normal and abnormal exit\n- Test with multiple concurrent MCP invocations","status":"open","priority":0,"issue_type":"bug","created_at":"2025-10-18T13:06:22.030027-07:00","updated_at":"2025-10-18T13:06:22.030027-07:00","dependencies":[{"issue_id":"bd-148","depends_on_id":"bd-155","type":"parent-child","created_at":"2025-10-18T13:07:49.121494-07:00","created_by":"daemon"}]}
|
{"id":"bd-148","title":"Add lifecycle management for beads-mcp processes","description":"MCP server processes accumulate without cleanup. Each tool invocation spawns a new Python process that lingers after Claude disconnects.\n\nObserved: 6+ beads-mcp processes running simultaneously.\n\nProblems:\n- No parent-child relationship tracking\n- No cleanup on MCP client disconnect\n- Processes leak over days of use\n- Could accumulate hundreds of processes\n\nLocation: integrations/beads-mcp/src/beads_mcp/server.py","design":"Add proper cleanup handlers to MCP server:\n\n1. Register atexit handler to close daemon connections\n2. Handle SIGTERM/SIGINT for graceful shutdown\n3. Close daemon client in cleanup()\n4. Remove any temp files\n\nOptional improvements:\n- Track active connections to daemon\n- Implement connection pooling\n- Add process timeout/TTL\n- Log lifecycle events for debugging\n\nExample:\nimport atexit\nimport signal\n\ndef cleanup():\n # Close daemon connections\n # Remove temp files\n pass\n\natexit.register(cleanup)\nsignal.signal(signal.SIGTERM, lambda s, f: cleanup())","acceptance_criteria":"- MCP processes clean up on exit\n- Daemon connections are properly closed\n- No process leaks after repeated use\n- Signal handlers work correctly\n- Cleanup runs on normal and abnormal exit\n- Test with multiple concurrent MCP invocations","status":"open","priority":0,"issue_type":"bug","created_at":"2025-10-18T13:06:22.030027-07:00","updated_at":"2025-10-18T13:06:22.030027-07:00","dependencies":[{"issue_id":"bd-148","depends_on_id":"bd-155","type":"parent-child","created_at":"2025-10-18T13:07:49.121494-07:00","created_by":"daemon"}]}
|
||||||
{"id":"bd-149","title":"Add global daemon auto-start support","description":"Auto-start only works for local daemon. Users with multiple repos must manually run 'bd daemon --global'.\n\nProblems:\n- No detection of whether global daemon is preferable\n- No migration path from local → global\n- Multi-repo users don't discover global daemon\n- Manual setup required\n\nLocation: cmd/bd/main.go:221-311","design":"Add heuristics to shouldUseGlobalDaemon():\n\n1. Count .beads repos under home directory\n2. If \u003e3 repos found, prefer global daemon\n3. Check BEADS_PREFER_GLOBAL_DAEMON env var\n4. Check config file preference\n\nUpdate tryAutoStartDaemon() to:\n- Use shouldUseGlobalDaemon() to pick mode\n- Pass --global flag when appropriate\n- Log decision for debugging\n\nAdd migration helper:\n- Detect running local daemon\n- Suggest switching to global if multi-repo detected\n- bd daemon --migrate-to-global command","acceptance_criteria":"- Auto-start uses global daemon when appropriate\n- Multi-repo detection works correctly\n- Users can configure preference\n- Migration path is smooth\n- Both local and global auto-start work\n- Documentation updated","status":"open","priority":1,"issue_type":"feature","created_at":"2025-10-18T13:06:33.633238-07:00","updated_at":"2025-10-18T13:06:33.633238-07:00","dependencies":[{"issue_id":"bd-149","depends_on_id":"bd-155","type":"parent-child","created_at":"2025-10-18T13:07:49.135552-07:00","created_by":"daemon"}]}
|
{"id":"bd-149","title":"Add global daemon auto-start support","description":"Auto-start only works for local daemon. Users with multiple repos must manually run 'bd daemon --global'.\n\nProblems:\n- No detection of whether global daemon is preferable\n- No migration path from local → global\n- Multi-repo users don't discover global daemon\n- Manual setup required\n\nLocation: cmd/bd/main.go:221-311","design":"Add heuristics to shouldUseGlobalDaemon():\n\n1. Count .beads repos under home directory\n2. If \u003e3 repos found, prefer global daemon\n3. Check BEADS_PREFER_GLOBAL_DAEMON env var\n4. Check config file preference\n\nUpdate tryAutoStartDaemon() to:\n- Use shouldUseGlobalDaemon() to pick mode\n- Pass --global flag when appropriate\n- Log decision for debugging\n\nAdd migration helper:\n- Detect running local daemon\n- Suggest switching to global if multi-repo detected\n- bd daemon --migrate-to-global command","acceptance_criteria":"- Auto-start uses global daemon when appropriate\n- Multi-repo detection works correctly\n- Users can configure preference\n- Migration path is smooth\n- Both local and global auto-start work\n- Documentation updated","status":"open","priority":1,"issue_type":"feature","created_at":"2025-10-18T13:06:33.633238-07:00","updated_at":"2025-10-18T13:06:33.633238-07:00","dependencies":[{"issue_id":"bd-149","depends_on_id":"bd-155","type":"parent-child","created_at":"2025-10-18T13:07:49.135552-07:00","created_by":"daemon"}]}
|
||||||
|
|||||||
@@ -38,10 +38,12 @@ The daemon will:
|
|||||||
- Auto-import when remote changes detected
|
- Auto-import when remote changes detected
|
||||||
|
|
||||||
Use --stop to stop a running daemon.
|
Use --stop to stop a running daemon.
|
||||||
Use --status to check if daemon is running.`,
|
Use --status to check if daemon is running.
|
||||||
|
Use --health to check daemon health and metrics.`,
|
||||||
Run: func(cmd *cobra.Command, args []string) {
|
Run: func(cmd *cobra.Command, args []string) {
|
||||||
stop, _ := cmd.Flags().GetBool("stop")
|
stop, _ := cmd.Flags().GetBool("stop")
|
||||||
status, _ := cmd.Flags().GetBool("status")
|
status, _ := cmd.Flags().GetBool("status")
|
||||||
|
health, _ := cmd.Flags().GetBool("health")
|
||||||
interval, _ := cmd.Flags().GetDuration("interval")
|
interval, _ := cmd.Flags().GetDuration("interval")
|
||||||
autoCommit, _ := cmd.Flags().GetBool("auto-commit")
|
autoCommit, _ := cmd.Flags().GetBool("auto-commit")
|
||||||
autoPush, _ := cmd.Flags().GetBool("auto-push")
|
autoPush, _ := cmd.Flags().GetBool("auto-push")
|
||||||
@@ -64,6 +66,11 @@ Use --status to check if daemon is running.`,
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if health {
|
||||||
|
showDaemonHealth(global)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
if stop {
|
if stop {
|
||||||
stopDaemon(pidFile)
|
stopDaemon(pidFile)
|
||||||
return
|
return
|
||||||
@@ -119,6 +126,7 @@ func init() {
|
|||||||
daemonCmd.Flags().Bool("auto-push", false, "Automatically push commits")
|
daemonCmd.Flags().Bool("auto-push", false, "Automatically push commits")
|
||||||
daemonCmd.Flags().Bool("stop", false, "Stop running daemon")
|
daemonCmd.Flags().Bool("stop", false, "Stop running daemon")
|
||||||
daemonCmd.Flags().Bool("status", false, "Show daemon status")
|
daemonCmd.Flags().Bool("status", false, "Show daemon status")
|
||||||
|
daemonCmd.Flags().Bool("health", false, "Check daemon health and metrics")
|
||||||
daemonCmd.Flags().String("log", "", "Log file path (default: .beads/daemon.log)")
|
daemonCmd.Flags().String("log", "", "Log file path (default: .beads/daemon.log)")
|
||||||
daemonCmd.Flags().Bool("global", false, "Run as global daemon (socket at ~/.beads/bd.sock)")
|
daemonCmd.Flags().Bool("global", false, "Run as global daemon (socket at ~/.beads/bd.sock)")
|
||||||
rootCmd.AddCommand(daemonCmd)
|
rootCmd.AddCommand(daemonCmd)
|
||||||
@@ -250,6 +258,78 @@ func showDaemonStatus(pidFile string, global bool) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func showDaemonHealth(global bool) {
|
||||||
|
var socketPath string
|
||||||
|
if global {
|
||||||
|
home, err := os.UserHomeDir()
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "Error: cannot get home directory: %v\n", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
socketPath = filepath.Join(home, ".beads", "bd.sock")
|
||||||
|
} else {
|
||||||
|
beadsDir, err := ensureBeadsDir()
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
socketPath = filepath.Join(beadsDir, "bd.sock")
|
||||||
|
}
|
||||||
|
|
||||||
|
client, err := rpc.TryConnect(socketPath)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "Error connecting to daemon: %v\n", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
if client == nil {
|
||||||
|
fmt.Println("✗ Daemon is not running")
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
defer client.Close()
|
||||||
|
|
||||||
|
health, err := client.Health()
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "Error checking health: %v\n", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
if jsonOutput {
|
||||||
|
data, _ := json.MarshalIndent(health, "", " ")
|
||||||
|
fmt.Println(string(data))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
statusIcon := "✓"
|
||||||
|
if health.Status == "unhealthy" {
|
||||||
|
statusIcon = "✗"
|
||||||
|
} else if health.Status == "degraded" {
|
||||||
|
statusIcon = "⚠"
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Printf("%s Daemon Health: %s\n", statusIcon, health.Status)
|
||||||
|
fmt.Printf(" Version: %s\n", health.Version)
|
||||||
|
fmt.Printf(" Uptime: %.1f seconds\n", health.Uptime)
|
||||||
|
fmt.Printf(" Cache Size: %d databases\n", health.CacheSize)
|
||||||
|
fmt.Printf(" Cache Hits: %d\n", health.CacheHits)
|
||||||
|
fmt.Printf(" Cache Misses: %d\n", health.CacheMisses)
|
||||||
|
|
||||||
|
if health.CacheHits+health.CacheMisses > 0 {
|
||||||
|
hitRate := float64(health.CacheHits) / float64(health.CacheHits+health.CacheMisses) * 100
|
||||||
|
fmt.Printf(" Cache Hit Rate: %.1f%%\n", hitRate)
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Printf(" DB Response Time: %.2f ms\n", health.DBResponseTime)
|
||||||
|
|
||||||
|
if health.Error != "" {
|
||||||
|
fmt.Printf(" Error: %s\n", health.Error)
|
||||||
|
}
|
||||||
|
|
||||||
|
if health.Status == "unhealthy" {
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func stopDaemon(pidFile string) {
|
func stopDaemon(pidFile string) {
|
||||||
if isRunning, pid := isDaemonRunning(pidFile); !isRunning {
|
if isRunning, pid := isDaemonRunning(pidFile); !isRunning {
|
||||||
fmt.Println("Daemon is not running")
|
fmt.Println("Daemon is not running")
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ type Client struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// TryConnect attempts to connect to the daemon socket
|
// TryConnect attempts to connect to the daemon socket
|
||||||
// Returns nil if no daemon is running
|
// Returns nil if no daemon is running or unhealthy
|
||||||
func TryConnect(socketPath string) (*Client, error) {
|
func TryConnect(socketPath string) (*Client, error) {
|
||||||
if _, err := os.Stat(socketPath); os.IsNotExist(err) {
|
if _, err := os.Stat(socketPath); os.IsNotExist(err) {
|
||||||
if os.Getenv("BD_DEBUG") != "" {
|
if os.Getenv("BD_DEBUG") != "" {
|
||||||
@@ -40,14 +40,28 @@ func TryConnect(socketPath string) (*Client, error) {
|
|||||||
timeout: 30 * time.Second,
|
timeout: 30 * time.Second,
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := client.Ping(); err != nil {
|
health, err := client.Health()
|
||||||
|
if err != nil {
|
||||||
if os.Getenv("BD_DEBUG") != "" {
|
if os.Getenv("BD_DEBUG") != "" {
|
||||||
fmt.Fprintf(os.Stderr, "Debug: ping failed: %v\n", err)
|
fmt.Fprintf(os.Stderr, "Debug: health check failed: %v\n", err)
|
||||||
}
|
}
|
||||||
conn.Close()
|
conn.Close()
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if health.Status == "unhealthy" {
|
||||||
|
if os.Getenv("BD_DEBUG") != "" {
|
||||||
|
fmt.Fprintf(os.Stderr, "Debug: daemon unhealthy: %s\n", health.Error)
|
||||||
|
}
|
||||||
|
conn.Close()
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if os.Getenv("BD_DEBUG") != "" {
|
||||||
|
fmt.Fprintf(os.Stderr, "Debug: connected to daemon (status: %s, uptime: %.1fs, cache: %d)\n",
|
||||||
|
health.Status, health.Uptime, health.CacheSize)
|
||||||
|
}
|
||||||
|
|
||||||
return client, nil
|
return client, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -131,6 +145,21 @@ func (c *Client) Ping() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Health sends a health check request to verify the daemon is healthy
|
||||||
|
func (c *Client) Health() (*HealthResponse, error) {
|
||||||
|
resp, err := c.Execute(OpHealth, nil)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var health HealthResponse
|
||||||
|
if err := json.Unmarshal(resp.Data, &health); err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to unmarshal health response: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return &health, nil
|
||||||
|
}
|
||||||
|
|
||||||
// Create creates a new issue via the daemon
|
// Create creates a new issue via the daemon
|
||||||
func (c *Client) Create(args *CreateArgs) (*Response, error) {
|
func (c *Client) Create(args *CreateArgs) (*Response, error) {
|
||||||
return c.Execute(OpCreate, args)
|
return c.Execute(OpCreate, args)
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import (
|
|||||||
// Operation constants for all bd commands
|
// Operation constants for all bd commands
|
||||||
const (
|
const (
|
||||||
OpPing = "ping"
|
OpPing = "ping"
|
||||||
|
OpHealth = "health"
|
||||||
OpCreate = "create"
|
OpCreate = "create"
|
||||||
OpUpdate = "update"
|
OpUpdate = "update"
|
||||||
OpClose = "close"
|
OpClose = "close"
|
||||||
@@ -137,6 +138,18 @@ type PingResponse struct {
|
|||||||
Version string `json:"version"`
|
Version string `json:"version"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// HealthResponse is the payload returned for a health check operation.
type HealthResponse struct {
	// Status is one of "healthy", "degraded" or "unhealthy".
	Status string `json:"status"`
	// Version is the version string reported by the daemon.
	Version string `json:"version"`
	// Uptime is serialized as "uptime_seconds".
	Uptime float64 `json:"uptime_seconds"`
	// CacheSize is serialized as "cache_size".
	CacheSize int `json:"cache_size"`
	// CacheHits is serialized as "cache_hits".
	CacheHits int64 `json:"cache_hits"`
	// CacheMisses is serialized as "cache_misses".
	CacheMisses int64 `json:"cache_misses"`
	// DBResponseTime is serialized as "db_response_ms".
	DBResponseTime float64 `json:"db_response_ms"`
	// Error is omitted from the JSON encoding when empty.
	Error string `json:"error,omitempty"`
}
|
||||||
|
|
||||||
// BatchArgs represents arguments for batch operations
|
// BatchArgs represents arguments for batch operations
|
||||||
type BatchArgs struct {
|
type BatchArgs struct {
|
||||||
Operations []BatchOperation `json:"operations"`
|
Operations []BatchOperation `json:"operations"`
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"sort"
|
"sort"
|
||||||
"sync"
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
"syscall"
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -40,6 +41,10 @@ type Server struct {
|
|||||||
maxCacheSize int
|
maxCacheSize int
|
||||||
cacheTTL time.Duration
|
cacheTTL time.Duration
|
||||||
cleanupTicker *time.Ticker
|
cleanupTicker *time.Ticker
|
||||||
|
// Health and metrics
|
||||||
|
startTime time.Time
|
||||||
|
cacheHits int64
|
||||||
|
cacheMisses int64
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewServer creates a new RPC server
|
// NewServer creates a new RPC server
|
||||||
@@ -68,6 +73,7 @@ func NewServer(socketPath string, store storage.Storage) *Server {
|
|||||||
maxCacheSize: maxCacheSize,
|
maxCacheSize: maxCacheSize,
|
||||||
cacheTTL: cacheTTL,
|
cacheTTL: cacheTTL,
|
||||||
shutdownChan: make(chan struct{}),
|
shutdownChan: make(chan struct{}),
|
||||||
|
startTime: time.Now(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -274,6 +280,8 @@ func (s *Server) handleRequest(req *Request) Response {
|
|||||||
switch req.Operation {
|
switch req.Operation {
|
||||||
case OpPing:
|
case OpPing:
|
||||||
return s.handlePing(req)
|
return s.handlePing(req)
|
||||||
|
case OpHealth:
|
||||||
|
return s.handleHealth(req)
|
||||||
case OpCreate:
|
case OpCreate:
|
||||||
return s.handleCreate(req)
|
return s.handleCreate(req)
|
||||||
case OpUpdate:
|
case OpUpdate:
|
||||||
@@ -379,6 +387,66 @@ func (s *Server) handlePing(_ *Request) Response {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *Server) handleHealth(req *Request) Response {
|
||||||
|
start := time.Now()
|
||||||
|
|
||||||
|
store, err := s.getStorageForRequest(req)
|
||||||
|
if err != nil {
|
||||||
|
data, _ := json.Marshal(HealthResponse{
|
||||||
|
Status: "unhealthy",
|
||||||
|
Version: "0.9.8",
|
||||||
|
Uptime: time.Since(s.startTime).Seconds(),
|
||||||
|
Error: fmt.Sprintf("storage error: %v", err),
|
||||||
|
})
|
||||||
|
return Response{
|
||||||
|
Success: false,
|
||||||
|
Data: data,
|
||||||
|
Error: fmt.Sprintf("storage error: %v", err),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
healthCtx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
status := "healthy"
|
||||||
|
dbError := ""
|
||||||
|
|
||||||
|
_, pingErr := store.GetStatistics(healthCtx)
|
||||||
|
dbResponseMs := time.Since(start).Seconds() * 1000
|
||||||
|
|
||||||
|
if pingErr != nil {
|
||||||
|
status = "unhealthy"
|
||||||
|
dbError = pingErr.Error()
|
||||||
|
} else if dbResponseMs > 500 {
|
||||||
|
status = "degraded"
|
||||||
|
}
|
||||||
|
|
||||||
|
s.cacheMu.RLock()
|
||||||
|
cacheSize := len(s.storageCache)
|
||||||
|
s.cacheMu.RUnlock()
|
||||||
|
|
||||||
|
health := HealthResponse{
|
||||||
|
Status: status,
|
||||||
|
Version: "0.9.8",
|
||||||
|
Uptime: time.Since(s.startTime).Seconds(),
|
||||||
|
CacheSize: cacheSize,
|
||||||
|
CacheHits: atomic.LoadInt64(&s.cacheHits),
|
||||||
|
CacheMisses: atomic.LoadInt64(&s.cacheMisses),
|
||||||
|
DBResponseTime: dbResponseMs,
|
||||||
|
}
|
||||||
|
|
||||||
|
if dbError != "" {
|
||||||
|
health.Error = dbError
|
||||||
|
}
|
||||||
|
|
||||||
|
data, _ := json.Marshal(health)
|
||||||
|
return Response{
|
||||||
|
Success: status != "unhealthy",
|
||||||
|
Data: data,
|
||||||
|
Error: dbError,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (s *Server) handleCreate(req *Request) Response {
|
func (s *Server) handleCreate(req *Request) Response {
|
||||||
var createArgs CreateArgs
|
var createArgs CreateArgs
|
||||||
if err := json.Unmarshal(req.Args, &createArgs); err != nil {
|
if err := json.Unmarshal(req.Args, &createArgs); err != nil {
|
||||||
@@ -849,8 +917,11 @@ func (s *Server) getStorageForRequest(req *Request) (storage.Storage, error) {
|
|||||||
if entry, ok := s.storageCache[repoRoot]; ok {
|
if entry, ok := s.storageCache[repoRoot]; ok {
|
||||||
// Update last access time (safe under Lock)
|
// Update last access time (safe under Lock)
|
||||||
entry.lastAccess = time.Now()
|
entry.lastAccess = time.Now()
|
||||||
|
atomic.AddInt64(&s.cacheHits, 1)
|
||||||
return entry.store, nil
|
return entry.store, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
atomic.AddInt64(&s.cacheMisses, 1)
|
||||||
|
|
||||||
// Open storage
|
// Open storage
|
||||||
store, err := sqlite.New(dbPath)
|
store, err := sqlite.New(dbPath)
|
||||||
|
|||||||
Reference in New Issue
Block a user