Add daemon health check endpoint (bd-146)

- Add OpHealth RPC operation to protocol
- Implement handleHealth() with DB ping and 1s timeout
- Returns status (healthy/degraded/unhealthy), uptime, cache metrics
- Update TryConnect() to use health check instead of ping
- Add 'bd daemon --health' CLI command with JSON output
- Track cache hits/misses for metrics
- Unhealthy daemon triggers automatic fallback to direct mode
- Health check completes in <2 seconds

Amp-Thread-ID: https://ampcode.com/threads/T-1a4889f3-77cf-433a-a704-e1c383929f48
Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
Steve Yegge
2025-10-18 13:41:06 -07:00
parent f987722f96
commit 9e2ee1889f
6 changed files with 199 additions and 5 deletions

View File

@@ -17,7 +17,7 @@ type Client struct {
}
// TryConnect attempts to connect to the daemon socket
// Returns nil if no daemon is running
// Returns nil if no daemon is running or unhealthy
func TryConnect(socketPath string) (*Client, error) {
if _, err := os.Stat(socketPath); os.IsNotExist(err) {
if os.Getenv("BD_DEBUG") != "" {
@@ -40,14 +40,28 @@ func TryConnect(socketPath string) (*Client, error) {
timeout: 30 * time.Second,
}
if err := client.Ping(); err != nil {
health, err := client.Health()
if err != nil {
if os.Getenv("BD_DEBUG") != "" {
fmt.Fprintf(os.Stderr, "Debug: ping failed: %v\n", err)
fmt.Fprintf(os.Stderr, "Debug: health check failed: %v\n", err)
}
conn.Close()
return nil, nil
}
if health.Status == "unhealthy" {
if os.Getenv("BD_DEBUG") != "" {
fmt.Fprintf(os.Stderr, "Debug: daemon unhealthy: %s\n", health.Error)
}
conn.Close()
return nil, nil
}
if os.Getenv("BD_DEBUG") != "" {
fmt.Fprintf(os.Stderr, "Debug: connected to daemon (status: %s, uptime: %.1fs, cache: %d)\n",
health.Status, health.Uptime, health.CacheSize)
}
return client, nil
}
@@ -131,6 +145,21 @@ func (c *Client) Ping() error {
return nil
}
// Health sends a health check request to verify the daemon is healthy
func (c *Client) Health() (*HealthResponse, error) {
resp, err := c.Execute(OpHealth, nil)
if err != nil {
return nil, err
}
var health HealthResponse
if err := json.Unmarshal(resp.Data, &health); err != nil {
return nil, fmt.Errorf("failed to unmarshal health response: %w", err)
}
return &health, nil
}
// Create creates a new issue via the daemon
func (c *Client) Create(args *CreateArgs) (*Response, error) {
return c.Execute(OpCreate, args)

View File

@@ -9,6 +9,7 @@ import (
// Operation constants for all bd commands
const (
OpPing = "ping"
OpHealth = "health"
OpCreate = "create"
OpUpdate = "update"
OpClose = "close"
@@ -137,6 +138,18 @@ type PingResponse struct {
Version string `json:"version"`
}
// HealthResponse is the response for a health check operation
type HealthResponse struct {
Status string `json:"status"` // "healthy", "degraded", "unhealthy"
Version string `json:"version"`
Uptime float64 `json:"uptime_seconds"`
CacheSize int `json:"cache_size"`
CacheHits int64 `json:"cache_hits"`
CacheMisses int64 `json:"cache_misses"`
DBResponseTime float64 `json:"db_response_ms"`
Error string `json:"error,omitempty"`
}
// BatchArgs represents arguments for batch operations
type BatchArgs struct {
Operations []BatchOperation `json:"operations"`

View File

@@ -11,6 +11,7 @@ import (
"path/filepath"
"sort"
"sync"
"sync/atomic"
"syscall"
"time"
@@ -40,6 +41,10 @@ type Server struct {
maxCacheSize int
cacheTTL time.Duration
cleanupTicker *time.Ticker
// Health and metrics
startTime time.Time
cacheHits int64
cacheMisses int64
}
// NewServer creates a new RPC server
@@ -68,6 +73,7 @@ func NewServer(socketPath string, store storage.Storage) *Server {
maxCacheSize: maxCacheSize,
cacheTTL: cacheTTL,
shutdownChan: make(chan struct{}),
startTime: time.Now(),
}
}
@@ -274,6 +280,8 @@ func (s *Server) handleRequest(req *Request) Response {
switch req.Operation {
case OpPing:
return s.handlePing(req)
case OpHealth:
return s.handleHealth(req)
case OpCreate:
return s.handleCreate(req)
case OpUpdate:
@@ -379,6 +387,66 @@ func (s *Server) handlePing(_ *Request) Response {
}
}
func (s *Server) handleHealth(req *Request) Response {
start := time.Now()
store, err := s.getStorageForRequest(req)
if err != nil {
data, _ := json.Marshal(HealthResponse{
Status: "unhealthy",
Version: "0.9.8",
Uptime: time.Since(s.startTime).Seconds(),
Error: fmt.Sprintf("storage error: %v", err),
})
return Response{
Success: false,
Data: data,
Error: fmt.Sprintf("storage error: %v", err),
}
}
healthCtx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
defer cancel()
status := "healthy"
dbError := ""
_, pingErr := store.GetStatistics(healthCtx)
dbResponseMs := time.Since(start).Seconds() * 1000
if pingErr != nil {
status = "unhealthy"
dbError = pingErr.Error()
} else if dbResponseMs > 500 {
status = "degraded"
}
s.cacheMu.RLock()
cacheSize := len(s.storageCache)
s.cacheMu.RUnlock()
health := HealthResponse{
Status: status,
Version: "0.9.8",
Uptime: time.Since(s.startTime).Seconds(),
CacheSize: cacheSize,
CacheHits: atomic.LoadInt64(&s.cacheHits),
CacheMisses: atomic.LoadInt64(&s.cacheMisses),
DBResponseTime: dbResponseMs,
}
if dbError != "" {
health.Error = dbError
}
data, _ := json.Marshal(health)
return Response{
Success: status != "unhealthy",
Data: data,
Error: dbError,
}
}
func (s *Server) handleCreate(req *Request) Response {
var createArgs CreateArgs
if err := json.Unmarshal(req.Args, &createArgs); err != nil {
@@ -849,8 +917,11 @@ func (s *Server) getStorageForRequest(req *Request) (storage.Storage, error) {
if entry, ok := s.storageCache[repoRoot]; ok {
// Update last access time (safe under Lock)
entry.lastAccess = time.Now()
atomic.AddInt64(&s.cacheHits, 1)
return entry.store, nil
}
atomic.AddInt64(&s.cacheMisses, 1)
// Open storage
store, err := sqlite.New(dbPath)