Add telemetry and observability to daemon (bd-153)
Implement comprehensive metrics collection for the daemon with zero-overhead design: Features: - Request metrics: counts, latency percentiles (p50, p95, p99), error rates - Cache metrics: hit/miss ratios, eviction counts, database connections - Connection metrics: total, active, rejected connections - System metrics: memory usage, goroutine count, uptime Implementation: - New internal/rpc/metrics.go with Metrics collector - OpMetrics RPC operation for programmatic access - 'bd daemon --metrics' command (human-readable and JSON output) - Lock-free atomic operations for cache/connection metrics - Copy-and-compute pattern in Snapshot to minimize lock contention - Deferred metrics recording ensures all requests are tracked Improvements from code review: - JSON types use float64 for ms/seconds (not time.Duration) - Snapshot copies data under short lock, computes outside - Union of operations from counts and errors maps - Defensive clamping in percentile calculation - Defer pattern ensures metrics recorded even on early returns Documentation updated in README.md with usage examples. Closes bd-153 Amp-Thread-ID: https://ampcode.com/threads/T-20213187-65c7-47f7-ba21-5234c9e52e26 Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
@@ -53,6 +53,7 @@ type Server struct {
|
||||
startTime time.Time
|
||||
cacheHits int64
|
||||
cacheMisses int64
|
||||
metrics *Metrics
|
||||
// Connection limiting
|
||||
maxConns int
|
||||
activeConns int32 // atomic counter
|
||||
@@ -103,6 +104,7 @@ func NewServer(socketPath string, store storage.Storage) *Server {
|
||||
cacheTTL: cacheTTL,
|
||||
shutdownChan: make(chan struct{}),
|
||||
startTime: time.Now(),
|
||||
metrics: NewMetrics(),
|
||||
maxConns: maxConns,
|
||||
connSemaphore: make(chan struct{}, maxConns),
|
||||
requestTimeout: requestTimeout,
|
||||
@@ -160,6 +162,7 @@ func (s *Server) Start(ctx context.Context) error {
|
||||
select {
|
||||
case s.connSemaphore <- struct{}{}:
|
||||
// Acquired slot, handle connection
|
||||
s.metrics.RecordConnection()
|
||||
go func(c net.Conn) {
|
||||
defer func() { <-s.connSemaphore }() // Release slot
|
||||
atomic.AddInt32(&s.activeConns, 1)
|
||||
@@ -168,6 +171,7 @@ func (s *Server) Start(ctx context.Context) error {
|
||||
}(conn)
|
||||
default:
|
||||
// Max connections reached, reject immediately
|
||||
s.metrics.RecordRejectedConnection()
|
||||
conn.Close()
|
||||
}
|
||||
}
|
||||
@@ -374,6 +378,7 @@ func (s *Server) evictStaleStorage() {
|
||||
for i := 0; i < numToEvict && i < len(items); i++ {
|
||||
toClose = append(toClose, items[i].entry.store)
|
||||
delete(s.storageCache, items[i].path)
|
||||
s.metrics.RecordCacheEviction()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -479,9 +484,19 @@ func (s *Server) checkVersionCompatibility(clientVersion string) error {
|
||||
}
|
||||
|
||||
func (s *Server) handleRequest(req *Request) Response {
|
||||
// Track request timing
|
||||
start := time.Now()
|
||||
|
||||
// Defer metrics recording to ensure it always happens
|
||||
defer func() {
|
||||
latency := time.Since(start)
|
||||
s.metrics.RecordRequest(req.Operation, latency)
|
||||
}()
|
||||
|
||||
// Check version compatibility (skip for ping/health to allow version checks)
|
||||
if req.Operation != OpPing && req.Operation != OpHealth {
|
||||
if err := s.checkVersionCompatibility(req.ClientVersion); err != nil {
|
||||
s.metrics.RecordError(req.Operation)
|
||||
return Response{
|
||||
Success: false,
|
||||
Error: err.Error(),
|
||||
@@ -489,49 +504,60 @@ func (s *Server) handleRequest(req *Request) Response {
|
||||
}
|
||||
}
|
||||
|
||||
var resp Response
|
||||
switch req.Operation {
|
||||
case OpPing:
|
||||
return s.handlePing(req)
|
||||
resp = s.handlePing(req)
|
||||
case OpHealth:
|
||||
return s.handleHealth(req)
|
||||
resp = s.handleHealth(req)
|
||||
case OpMetrics:
|
||||
resp = s.handleMetrics(req)
|
||||
case OpCreate:
|
||||
return s.handleCreate(req)
|
||||
resp = s.handleCreate(req)
|
||||
case OpUpdate:
|
||||
return s.handleUpdate(req)
|
||||
resp = s.handleUpdate(req)
|
||||
case OpClose:
|
||||
return s.handleClose(req)
|
||||
resp = s.handleClose(req)
|
||||
case OpList:
|
||||
return s.handleList(req)
|
||||
resp = s.handleList(req)
|
||||
case OpShow:
|
||||
return s.handleShow(req)
|
||||
resp = s.handleShow(req)
|
||||
case OpReady:
|
||||
return s.handleReady(req)
|
||||
resp = s.handleReady(req)
|
||||
case OpStats:
|
||||
return s.handleStats(req)
|
||||
resp = s.handleStats(req)
|
||||
case OpDepAdd:
|
||||
return s.handleDepAdd(req)
|
||||
resp = s.handleDepAdd(req)
|
||||
case OpDepRemove:
|
||||
return s.handleDepRemove(req)
|
||||
resp = s.handleDepRemove(req)
|
||||
case OpLabelAdd:
|
||||
return s.handleLabelAdd(req)
|
||||
resp = s.handleLabelAdd(req)
|
||||
case OpLabelRemove:
|
||||
return s.handleLabelRemove(req)
|
||||
resp = s.handleLabelRemove(req)
|
||||
case OpBatch:
|
||||
return s.handleBatch(req)
|
||||
resp = s.handleBatch(req)
|
||||
case OpReposList:
|
||||
return s.handleReposList(req)
|
||||
resp = s.handleReposList(req)
|
||||
case OpReposReady:
|
||||
return s.handleReposReady(req)
|
||||
resp = s.handleReposReady(req)
|
||||
case OpReposStats:
|
||||
return s.handleReposStats(req)
|
||||
resp = s.handleReposStats(req)
|
||||
case OpReposClearCache:
|
||||
return s.handleReposClearCache(req)
|
||||
resp = s.handleReposClearCache(req)
|
||||
default:
|
||||
s.metrics.RecordError(req.Operation)
|
||||
return Response{
|
||||
Success: false,
|
||||
Error: fmt.Sprintf("unknown operation: %s", req.Operation),
|
||||
}
|
||||
}
|
||||
|
||||
// Record error if request failed
|
||||
if !resp.Success {
|
||||
s.metrics.RecordError(req.Operation)
|
||||
}
|
||||
|
||||
return resp
|
||||
}
|
||||
|
||||
// Adapter helpers
|
||||
@@ -676,6 +702,25 @@ func (s *Server) handleHealth(req *Request) Response {
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Server) handleMetrics(_ *Request) Response {
|
||||
s.cacheMu.RLock()
|
||||
cacheSize := len(s.storageCache)
|
||||
s.cacheMu.RUnlock()
|
||||
|
||||
snapshot := s.metrics.Snapshot(
|
||||
atomic.LoadInt64(&s.cacheHits),
|
||||
atomic.LoadInt64(&s.cacheMisses),
|
||||
cacheSize,
|
||||
int(atomic.LoadInt32(&s.activeConns)),
|
||||
)
|
||||
|
||||
data, _ := json.Marshal(snapshot)
|
||||
return Response{
|
||||
Success: true,
|
||||
Data: data,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Server) handleCreate(req *Request) Response {
|
||||
var createArgs CreateArgs
|
||||
if err := json.Unmarshal(req.Args, &createArgs); err != nil {
|
||||
|
||||
Reference in New Issue
Block a user