Add telemetry and observability to daemon (bd-153)
Implement comprehensive metrics collection for the daemon with zero-overhead design: Features: - Request metrics: counts, latency percentiles (p50, p95, p99), error rates - Cache metrics: hit/miss ratios, eviction counts, database connections - Connection metrics: total, active, rejected connections - System metrics: memory usage, goroutine count, uptime Implementation: - New internal/rpc/metrics.go with Metrics collector - OpMetrics RPC operation for programmatic access - 'bd daemon --metrics' command (human-readable and JSON output) - Lock-free atomic operations for cache/connection metrics - Copy-and-compute pattern in Snapshot to minimize lock contention - Deferred metrics recording ensures all requests are tracked Improvements from code review: - JSON types use float64 for ms/seconds (not time.Duration) - Snapshot copies data under short lock, computes outside - Union of operations from counts and errors maps - Defensive clamping in percentile calculation - Defer pattern ensures metrics recorded even on early returns Documentation updated in README.md with usage examples. Closes bd-153 Amp-Thread-ID: https://ampcode.com/threads/T-20213187-65c7-47f7-ba21-5234c9e52e26 Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
101
cmd/bd/daemon.go
101
cmd/bd/daemon.go
@@ -45,6 +45,7 @@ Use --health to check daemon health and metrics.`,
|
||||
stop, _ := cmd.Flags().GetBool("stop")
|
||||
status, _ := cmd.Flags().GetBool("status")
|
||||
health, _ := cmd.Flags().GetBool("health")
|
||||
metrics, _ := cmd.Flags().GetBool("metrics")
|
||||
migrateToGlobal, _ := cmd.Flags().GetBool("migrate-to-global")
|
||||
interval, _ := cmd.Flags().GetDuration("interval")
|
||||
autoCommit, _ := cmd.Flags().GetBool("auto-commit")
|
||||
@@ -73,6 +74,11 @@ Use --health to check daemon health and metrics.`,
|
||||
return
|
||||
}
|
||||
|
||||
if metrics {
|
||||
showDaemonMetrics(global)
|
||||
return
|
||||
}
|
||||
|
||||
if migrateToGlobal {
|
||||
migrateToGlobalDaemon()
|
||||
return
|
||||
@@ -134,6 +140,7 @@ func init() {
|
||||
daemonCmd.Flags().Bool("stop", false, "Stop running daemon")
|
||||
daemonCmd.Flags().Bool("status", false, "Show daemon status")
|
||||
daemonCmd.Flags().Bool("health", false, "Check daemon health and metrics")
|
||||
daemonCmd.Flags().Bool("metrics", false, "Show detailed daemon metrics")
|
||||
daemonCmd.Flags().Bool("migrate-to-global", false, "Migrate from local to global daemon")
|
||||
daemonCmd.Flags().String("log", "", "Log file path (default: .beads/daemon.log)")
|
||||
daemonCmd.Flags().Bool("global", false, "Run as global daemon (socket at ~/.beads/bd.sock)")
|
||||
@@ -356,6 +363,100 @@ func showDaemonHealth(global bool) {
|
||||
}
|
||||
}
|
||||
|
||||
func showDaemonMetrics(global bool) {
|
||||
var socketPath string
|
||||
if global {
|
||||
home, err := os.UserHomeDir()
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error: cannot get home directory: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
socketPath = filepath.Join(home, ".beads", "bd.sock")
|
||||
} else {
|
||||
beadsDir, err := ensureBeadsDir()
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
socketPath = filepath.Join(beadsDir, "bd.sock")
|
||||
}
|
||||
|
||||
client, err := rpc.TryConnect(socketPath)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error connecting to daemon: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
if client == nil {
|
||||
fmt.Println("✗ Daemon is not running")
|
||||
os.Exit(1)
|
||||
}
|
||||
defer client.Close()
|
||||
|
||||
metrics, err := client.Metrics()
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error fetching metrics: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
if jsonOutput {
|
||||
data, _ := json.MarshalIndent(metrics, "", " ")
|
||||
fmt.Println(string(data))
|
||||
return
|
||||
}
|
||||
|
||||
// Human-readable output
|
||||
fmt.Printf("Daemon Metrics\n")
|
||||
fmt.Printf("==============\n\n")
|
||||
|
||||
fmt.Printf("Uptime: %.1f seconds (%.1f minutes)\n", metrics.UptimeSeconds, metrics.UptimeSeconds/60)
|
||||
fmt.Printf("Timestamp: %s\n\n", metrics.Timestamp.Format(time.RFC3339))
|
||||
|
||||
// Cache metrics
|
||||
fmt.Printf("Cache Metrics:\n")
|
||||
fmt.Printf(" Size: %d databases\n", metrics.CacheSize)
|
||||
fmt.Printf(" Hits: %d\n", metrics.CacheHits)
|
||||
fmt.Printf(" Misses: %d\n", metrics.CacheMisses)
|
||||
if metrics.CacheHits+metrics.CacheMisses > 0 {
|
||||
hitRate := float64(metrics.CacheHits) / float64(metrics.CacheHits+metrics.CacheMisses) * 100
|
||||
fmt.Printf(" Hit Rate: %.1f%%\n", hitRate)
|
||||
}
|
||||
fmt.Printf(" Evictions: %d\n\n", metrics.CacheEvictions)
|
||||
|
||||
// Connection metrics
|
||||
fmt.Printf("Connection Metrics:\n")
|
||||
fmt.Printf(" Total: %d\n", metrics.TotalConns)
|
||||
fmt.Printf(" Active: %d\n", metrics.ActiveConns)
|
||||
fmt.Printf(" Rejected: %d\n\n", metrics.RejectedConns)
|
||||
|
||||
// System metrics
|
||||
fmt.Printf("System Metrics:\n")
|
||||
fmt.Printf(" Memory Alloc: %d MB\n", metrics.MemoryAllocMB)
|
||||
fmt.Printf(" Memory Sys: %d MB\n", metrics.MemorySysMB)
|
||||
fmt.Printf(" Goroutines: %d\n\n", metrics.GoroutineCount)
|
||||
|
||||
// Operation metrics
|
||||
if len(metrics.Operations) > 0 {
|
||||
fmt.Printf("Operation Metrics:\n")
|
||||
for _, op := range metrics.Operations {
|
||||
fmt.Printf("\n %s:\n", op.Operation)
|
||||
fmt.Printf(" Total Requests: %d\n", op.TotalCount)
|
||||
fmt.Printf(" Successful: %d\n", op.SuccessCount)
|
||||
fmt.Printf(" Errors: %d\n", op.ErrorCount)
|
||||
|
||||
if op.Latency.AvgMS > 0 {
|
||||
fmt.Printf(" Latency:\n")
|
||||
fmt.Printf(" Min: %.3f ms\n", op.Latency.MinMS)
|
||||
fmt.Printf(" Avg: %.3f ms\n", op.Latency.AvgMS)
|
||||
fmt.Printf(" P50: %.3f ms\n", op.Latency.P50MS)
|
||||
fmt.Printf(" P95: %.3f ms\n", op.Latency.P95MS)
|
||||
fmt.Printf(" P99: %.3f ms\n", op.Latency.P99MS)
|
||||
fmt.Printf(" Max: %.3f ms\n", op.Latency.MaxMS)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func migrateToGlobalDaemon() {
|
||||
home, err := os.UserHomeDir()
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user