Add telemetry and observability to daemon (bd-153)

Implement comprehensive metrics collection for the daemon with zero-overhead design.

Features:
- Request metrics: counts, latency percentiles (p50, p95, p99), error rates
- Cache metrics: hit/miss ratios, eviction counts, database connections
- Connection metrics: total, active, rejected connections
- System metrics: memory usage, goroutine count, uptime

Implementation:
- New internal/rpc/metrics.go with Metrics collector
- OpMetrics RPC operation for programmatic access
- 'bd daemon --metrics' command (human-readable and JSON output)
- Lock-free atomic operations for cache/connection metrics
- Copy-and-compute pattern in Snapshot to minimize lock contention
- Deferred metrics recording ensures all requests are tracked

Improvements from code review:
- JSON types use float64 for ms/seconds (not time.Duration)
- Snapshot copies data under a short lock, computes outside
- Union of operations taken from both the counts and errors maps
- Defensive clamping in percentile calculation
- Defer pattern ensures metrics are recorded even on early returns

Documentation updated in README.md with usage examples.

Closes bd-153

Amp-Thread-ID: https://ampcode.com/threads/T-20213187-65c7-47f7-ba21-5234c9e52e26
Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
@@ -165,6 +165,21 @@ func (c *Client) Health() (*HealthResponse, error) {
|
||||
return &health, nil
|
||||
}
|
||||
|
||||
// Metrics retrieves daemon metrics
|
||||
func (c *Client) Metrics() (*MetricsSnapshot, error) {
|
||||
resp, err := c.Execute(OpMetrics, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var metrics MetricsSnapshot
|
||||
if err := json.Unmarshal(resp.Data, &metrics); err != nil {
|
||||
return nil, fmt.Errorf("failed to unmarshal metrics response: %w", err)
|
||||
}
|
||||
|
||||
return &metrics, nil
|
||||
}
|
||||
|
||||
// Create creates a new issue via the daemon
|
||||
func (c *Client) Create(args *CreateArgs) (*Response, error) {
|
||||
return c.Execute(OpCreate, args)
|
||||
|
||||
252
internal/rpc/metrics.go
Normal file
252
internal/rpc/metrics.go
Normal file
@@ -0,0 +1,252 @@
|
||||
package rpc
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
"sort"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Metrics is the daemon's telemetry collector. Request counters and
// latency samples are guarded by mu; the connection and cache-eviction
// counters are maintained lock-free with sync/atomic.
type Metrics struct {
	mu sync.RWMutex

	// Per-operation request bookkeeping (guarded by mu).
	requestCounts  map[string]int64           // operation -> total requests
	requestErrors  map[string]int64           // operation -> failed requests
	requestLatency map[string][]time.Duration // operation -> bounded latency samples
	maxSamples     int                        // cap on latency samples kept per operation

	// Connection counters (always access via sync/atomic).
	totalConns    int64
	rejectedConns int64

	// Cache eviction counter (always access via sync/atomic; incremented
	// by the server when it evicts cached storage handles).
	cacheEvictions int64

	// startTime anchors uptime reporting; written once at construction.
	startTime time.Time
}
|
||||
|
||||
// NewMetrics creates a new metrics collector
|
||||
func NewMetrics() *Metrics {
|
||||
return &Metrics{
|
||||
requestCounts: make(map[string]int64),
|
||||
requestErrors: make(map[string]int64),
|
||||
requestLatency: make(map[string][]time.Duration),
|
||||
maxSamples: 1000, // Keep last 1000 samples per operation
|
||||
startTime: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
// RecordRequest records a request (successful or failed)
|
||||
func (m *Metrics) RecordRequest(operation string, latency time.Duration) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
m.requestCounts[operation]++
|
||||
|
||||
// Add latency sample to bounded slice
|
||||
samples := m.requestLatency[operation]
|
||||
if len(samples) >= m.maxSamples {
|
||||
// Drop oldest sample to maintain max size
|
||||
samples = samples[1:]
|
||||
}
|
||||
samples = append(samples, latency)
|
||||
m.requestLatency[operation] = samples
|
||||
}
|
||||
|
||||
// RecordError records a failed request
|
||||
func (m *Metrics) RecordError(operation string) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
m.requestErrors[operation]++
|
||||
}
|
||||
|
||||
// RecordConnection records a new connection
|
||||
func (m *Metrics) RecordConnection() {
|
||||
atomic.AddInt64(&m.totalConns, 1)
|
||||
}
|
||||
|
||||
// RecordRejectedConnection records a rejected connection (max conns reached)
|
||||
func (m *Metrics) RecordRejectedConnection() {
|
||||
atomic.AddInt64(&m.rejectedConns, 1)
|
||||
}
|
||||
|
||||
// RecordCacheEviction records a cache eviction event
|
||||
func (m *Metrics) RecordCacheEviction() {
|
||||
atomic.AddInt64(&m.cacheEvictions, 1)
|
||||
}
|
||||
|
||||
// Snapshot returns a point-in-time snapshot of all metrics
|
||||
func (m *Metrics) Snapshot(cacheHits, cacheMisses int64, cacheSize, activeConns int) MetricsSnapshot {
|
||||
// Copy data under a short critical section
|
||||
m.mu.RLock()
|
||||
|
||||
// Build union of all operations (from both counts and errors)
|
||||
opsSet := make(map[string]struct{})
|
||||
for op := range m.requestCounts {
|
||||
opsSet[op] = struct{}{}
|
||||
}
|
||||
for op := range m.requestErrors {
|
||||
opsSet[op] = struct{}{}
|
||||
}
|
||||
|
||||
// Copy counts, errors, and latency slices
|
||||
countsCopy := make(map[string]int64, len(opsSet))
|
||||
errorsCopy := make(map[string]int64, len(opsSet))
|
||||
latCopy := make(map[string][]time.Duration, len(opsSet))
|
||||
|
||||
for op := range opsSet {
|
||||
countsCopy[op] = m.requestCounts[op]
|
||||
errorsCopy[op] = m.requestErrors[op]
|
||||
// Deep copy the latency slice
|
||||
if samples := m.requestLatency[op]; len(samples) > 0 {
|
||||
latCopy[op] = append([]time.Duration(nil), samples...)
|
||||
}
|
||||
}
|
||||
|
||||
m.mu.RUnlock()
|
||||
|
||||
// Compute statistics outside the lock
|
||||
uptime := time.Since(m.startTime)
|
||||
|
||||
// Calculate per-operation stats
|
||||
operations := make([]OperationMetrics, 0, len(opsSet))
|
||||
for op := range opsSet {
|
||||
count := countsCopy[op]
|
||||
errors := errorsCopy[op]
|
||||
samples := latCopy[op]
|
||||
|
||||
// Ensure success count is never negative
|
||||
successCount := count - errors
|
||||
if successCount < 0 {
|
||||
successCount = 0
|
||||
}
|
||||
|
||||
opMetrics := OperationMetrics{
|
||||
Operation: op,
|
||||
TotalCount: count,
|
||||
ErrorCount: errors,
|
||||
SuccessCount: successCount,
|
||||
}
|
||||
|
||||
// Calculate latency percentiles if we have samples
|
||||
if len(samples) > 0 {
|
||||
opMetrics.Latency = calculateLatencyStats(samples)
|
||||
}
|
||||
|
||||
operations = append(operations, opMetrics)
|
||||
}
|
||||
|
||||
// Sort by total count (most frequent first)
|
||||
sort.Slice(operations, func(i, j int) bool {
|
||||
return operations[i].TotalCount > operations[j].TotalCount
|
||||
})
|
||||
|
||||
// Get memory stats
|
||||
var memStats runtime.MemStats
|
||||
runtime.ReadMemStats(&memStats)
|
||||
|
||||
return MetricsSnapshot{
|
||||
Timestamp: time.Now(),
|
||||
UptimeSeconds: uptime.Seconds(),
|
||||
Operations: operations,
|
||||
CacheHits: cacheHits,
|
||||
CacheMisses: cacheMisses,
|
||||
CacheSize: cacheSize,
|
||||
CacheEvictions: atomic.LoadInt64(&m.cacheEvictions),
|
||||
TotalConns: atomic.LoadInt64(&m.totalConns),
|
||||
ActiveConns: activeConns,
|
||||
RejectedConns: atomic.LoadInt64(&m.rejectedConns),
|
||||
MemoryAllocMB: memStats.Alloc / 1024 / 1024,
|
||||
MemorySysMB: memStats.Sys / 1024 / 1024,
|
||||
GoroutineCount: runtime.NumGoroutine(),
|
||||
}
|
||||
}
|
||||
|
||||
// MetricsSnapshot is a point-in-time view of all metrics
|
||||
type MetricsSnapshot struct {
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
UptimeSeconds float64 `json:"uptime_seconds"`
|
||||
Operations []OperationMetrics `json:"operations"`
|
||||
CacheHits int64 `json:"cache_hits"`
|
||||
CacheMisses int64 `json:"cache_misses"`
|
||||
CacheSize int `json:"cache_size"`
|
||||
CacheEvictions int64 `json:"cache_evictions"`
|
||||
TotalConns int64 `json:"total_connections"`
|
||||
ActiveConns int `json:"active_connections"`
|
||||
RejectedConns int64 `json:"rejected_connections"`
|
||||
MemoryAllocMB uint64 `json:"memory_alloc_mb"`
|
||||
MemorySysMB uint64 `json:"memory_sys_mb"`
|
||||
GoroutineCount int `json:"goroutine_count"`
|
||||
}
|
||||
|
||||
// OperationMetrics holds metrics for a single operation type
|
||||
type OperationMetrics struct {
|
||||
Operation string `json:"operation"`
|
||||
TotalCount int64 `json:"total_count"`
|
||||
SuccessCount int64 `json:"success_count"`
|
||||
ErrorCount int64 `json:"error_count"`
|
||||
Latency LatencyStats `json:"latency,omitempty"`
|
||||
}
|
||||
|
||||
// LatencyStats holds latency percentile data in milliseconds
|
||||
type LatencyStats struct {
|
||||
MinMS float64 `json:"min_ms"`
|
||||
P50MS float64 `json:"p50_ms"`
|
||||
P95MS float64 `json:"p95_ms"`
|
||||
P99MS float64 `json:"p99_ms"`
|
||||
MaxMS float64 `json:"max_ms"`
|
||||
AvgMS float64 `json:"avg_ms"`
|
||||
}
|
||||
|
||||
// calculateLatencyStats computes percentiles from latency samples and returns milliseconds
|
||||
func calculateLatencyStats(samples []time.Duration) LatencyStats {
|
||||
if len(samples) == 0 {
|
||||
return LatencyStats{}
|
||||
}
|
||||
|
||||
// Sort samples
|
||||
sorted := make([]time.Duration, len(samples))
|
||||
copy(sorted, samples)
|
||||
sort.Slice(sorted, func(i, j int) bool {
|
||||
return sorted[i] < sorted[j]
|
||||
})
|
||||
|
||||
n := len(sorted)
|
||||
// Calculate percentiles with defensive clamping
|
||||
p50Idx := min(n-1, n*50/100)
|
||||
p95Idx := min(n-1, n*95/100)
|
||||
p99Idx := min(n-1, n*99/100)
|
||||
|
||||
// Calculate average
|
||||
var sum time.Duration
|
||||
for _, d := range sorted {
|
||||
sum += d
|
||||
}
|
||||
avg := sum / time.Duration(n)
|
||||
|
||||
// Convert to milliseconds
|
||||
toMS := func(d time.Duration) float64 {
|
||||
return float64(d) / float64(time.Millisecond)
|
||||
}
|
||||
|
||||
return LatencyStats{
|
||||
MinMS: toMS(sorted[0]),
|
||||
P50MS: toMS(sorted[p50Idx]),
|
||||
P95MS: toMS(sorted[p95Idx]),
|
||||
P99MS: toMS(sorted[p99Idx]),
|
||||
MaxMS: toMS(sorted[n-1]),
|
||||
AvgMS: toMS(avg),
|
||||
}
|
||||
}
|
||||
|
||||
func min(a, b int) int {
|
||||
if a < b {
|
||||
return a
|
||||
}
|
||||
return b
|
||||
}
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
const (
|
||||
OpPing = "ping"
|
||||
OpHealth = "health"
|
||||
OpMetrics = "metrics"
|
||||
OpCreate = "create"
|
||||
OpUpdate = "update"
|
||||
OpClose = "close"
|
||||
|
||||
@@ -53,6 +53,7 @@ type Server struct {
|
||||
startTime time.Time
|
||||
cacheHits int64
|
||||
cacheMisses int64
|
||||
metrics *Metrics
|
||||
// Connection limiting
|
||||
maxConns int
|
||||
activeConns int32 // atomic counter
|
||||
@@ -103,6 +104,7 @@ func NewServer(socketPath string, store storage.Storage) *Server {
|
||||
cacheTTL: cacheTTL,
|
||||
shutdownChan: make(chan struct{}),
|
||||
startTime: time.Now(),
|
||||
metrics: NewMetrics(),
|
||||
maxConns: maxConns,
|
||||
connSemaphore: make(chan struct{}, maxConns),
|
||||
requestTimeout: requestTimeout,
|
||||
@@ -160,6 +162,7 @@ func (s *Server) Start(ctx context.Context) error {
|
||||
select {
|
||||
case s.connSemaphore <- struct{}{}:
|
||||
// Acquired slot, handle connection
|
||||
s.metrics.RecordConnection()
|
||||
go func(c net.Conn) {
|
||||
defer func() { <-s.connSemaphore }() // Release slot
|
||||
atomic.AddInt32(&s.activeConns, 1)
|
||||
@@ -168,6 +171,7 @@ func (s *Server) Start(ctx context.Context) error {
|
||||
}(conn)
|
||||
default:
|
||||
// Max connections reached, reject immediately
|
||||
s.metrics.RecordRejectedConnection()
|
||||
conn.Close()
|
||||
}
|
||||
}
|
||||
@@ -374,6 +378,7 @@ func (s *Server) evictStaleStorage() {
|
||||
for i := 0; i < numToEvict && i < len(items); i++ {
|
||||
toClose = append(toClose, items[i].entry.store)
|
||||
delete(s.storageCache, items[i].path)
|
||||
s.metrics.RecordCacheEviction()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -479,9 +484,19 @@ func (s *Server) checkVersionCompatibility(clientVersion string) error {
|
||||
}
|
||||
|
||||
func (s *Server) handleRequest(req *Request) Response {
|
||||
// Track request timing
|
||||
start := time.Now()
|
||||
|
||||
// Defer metrics recording to ensure it always happens
|
||||
defer func() {
|
||||
latency := time.Since(start)
|
||||
s.metrics.RecordRequest(req.Operation, latency)
|
||||
}()
|
||||
|
||||
// Check version compatibility (skip for ping/health to allow version checks)
|
||||
if req.Operation != OpPing && req.Operation != OpHealth {
|
||||
if err := s.checkVersionCompatibility(req.ClientVersion); err != nil {
|
||||
s.metrics.RecordError(req.Operation)
|
||||
return Response{
|
||||
Success: false,
|
||||
Error: err.Error(),
|
||||
@@ -489,49 +504,60 @@ func (s *Server) handleRequest(req *Request) Response {
|
||||
}
|
||||
}
|
||||
|
||||
var resp Response
|
||||
switch req.Operation {
|
||||
case OpPing:
|
||||
return s.handlePing(req)
|
||||
resp = s.handlePing(req)
|
||||
case OpHealth:
|
||||
return s.handleHealth(req)
|
||||
resp = s.handleHealth(req)
|
||||
case OpMetrics:
|
||||
resp = s.handleMetrics(req)
|
||||
case OpCreate:
|
||||
return s.handleCreate(req)
|
||||
resp = s.handleCreate(req)
|
||||
case OpUpdate:
|
||||
return s.handleUpdate(req)
|
||||
resp = s.handleUpdate(req)
|
||||
case OpClose:
|
||||
return s.handleClose(req)
|
||||
resp = s.handleClose(req)
|
||||
case OpList:
|
||||
return s.handleList(req)
|
||||
resp = s.handleList(req)
|
||||
case OpShow:
|
||||
return s.handleShow(req)
|
||||
resp = s.handleShow(req)
|
||||
case OpReady:
|
||||
return s.handleReady(req)
|
||||
resp = s.handleReady(req)
|
||||
case OpStats:
|
||||
return s.handleStats(req)
|
||||
resp = s.handleStats(req)
|
||||
case OpDepAdd:
|
||||
return s.handleDepAdd(req)
|
||||
resp = s.handleDepAdd(req)
|
||||
case OpDepRemove:
|
||||
return s.handleDepRemove(req)
|
||||
resp = s.handleDepRemove(req)
|
||||
case OpLabelAdd:
|
||||
return s.handleLabelAdd(req)
|
||||
resp = s.handleLabelAdd(req)
|
||||
case OpLabelRemove:
|
||||
return s.handleLabelRemove(req)
|
||||
resp = s.handleLabelRemove(req)
|
||||
case OpBatch:
|
||||
return s.handleBatch(req)
|
||||
resp = s.handleBatch(req)
|
||||
case OpReposList:
|
||||
return s.handleReposList(req)
|
||||
resp = s.handleReposList(req)
|
||||
case OpReposReady:
|
||||
return s.handleReposReady(req)
|
||||
resp = s.handleReposReady(req)
|
||||
case OpReposStats:
|
||||
return s.handleReposStats(req)
|
||||
resp = s.handleReposStats(req)
|
||||
case OpReposClearCache:
|
||||
return s.handleReposClearCache(req)
|
||||
resp = s.handleReposClearCache(req)
|
||||
default:
|
||||
s.metrics.RecordError(req.Operation)
|
||||
return Response{
|
||||
Success: false,
|
||||
Error: fmt.Sprintf("unknown operation: %s", req.Operation),
|
||||
}
|
||||
}
|
||||
|
||||
// Record error if request failed
|
||||
if !resp.Success {
|
||||
s.metrics.RecordError(req.Operation)
|
||||
}
|
||||
|
||||
return resp
|
||||
}
|
||||
|
||||
// Adapter helpers
|
||||
@@ -676,6 +702,25 @@ func (s *Server) handleHealth(req *Request) Response {
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Server) handleMetrics(_ *Request) Response {
|
||||
s.cacheMu.RLock()
|
||||
cacheSize := len(s.storageCache)
|
||||
s.cacheMu.RUnlock()
|
||||
|
||||
snapshot := s.metrics.Snapshot(
|
||||
atomic.LoadInt64(&s.cacheHits),
|
||||
atomic.LoadInt64(&s.cacheMisses),
|
||||
cacheSize,
|
||||
int(atomic.LoadInt32(&s.activeConns)),
|
||||
)
|
||||
|
||||
data, _ := json.Marshal(snapshot)
|
||||
return Response{
|
||||
Success: true,
|
||||
Data: data,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Server) handleCreate(req *Request) Response {
|
||||
var createArgs CreateArgs
|
||||
if err := json.Unmarshal(req.Args, &createArgs); err != nil {
|
||||
|
||||
Reference in New Issue
Block a user