Add daemon health check endpoint (bd-146)
- Add OpHealth RPC operation to protocol
- Implement handleHealth() with DB ping and 1s timeout
- Returns status (healthy/degraded/unhealthy), uptime, cache metrics
- Update TryConnect() to use health check instead of ping
- Add 'bd daemon --health' CLI command with JSON output
- Track cache hits/misses for metrics
- Unhealthy daemon triggers automatic fallback to direct mode
- Health check completes in <2 seconds

Amp-Thread-ID: https://ampcode.com/threads/T-1a4889f3-77cf-433a-a704-e1c383929f48
Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
@@ -17,7 +17,7 @@ type Client struct {
|
||||
}
|
||||
|
||||
// TryConnect attempts to connect to the daemon socket
|
||||
// Returns nil if no daemon is running
|
||||
// Returns nil if no daemon is running or unhealthy
|
||||
func TryConnect(socketPath string) (*Client, error) {
|
||||
if _, err := os.Stat(socketPath); os.IsNotExist(err) {
|
||||
if os.Getenv("BD_DEBUG") != "" {
|
||||
@@ -40,14 +40,28 @@ func TryConnect(socketPath string) (*Client, error) {
|
||||
timeout: 30 * time.Second,
|
||||
}
|
||||
|
||||
if err := client.Ping(); err != nil {
|
||||
health, err := client.Health()
|
||||
if err != nil {
|
||||
if os.Getenv("BD_DEBUG") != "" {
|
||||
fmt.Fprintf(os.Stderr, "Debug: ping failed: %v\n", err)
|
||||
fmt.Fprintf(os.Stderr, "Debug: health check failed: %v\n", err)
|
||||
}
|
||||
conn.Close()
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if health.Status == "unhealthy" {
|
||||
if os.Getenv("BD_DEBUG") != "" {
|
||||
fmt.Fprintf(os.Stderr, "Debug: daemon unhealthy: %s\n", health.Error)
|
||||
}
|
||||
conn.Close()
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if os.Getenv("BD_DEBUG") != "" {
|
||||
fmt.Fprintf(os.Stderr, "Debug: connected to daemon (status: %s, uptime: %.1fs, cache: %d)\n",
|
||||
health.Status, health.Uptime, health.CacheSize)
|
||||
}
|
||||
|
||||
return client, nil
|
||||
}
|
||||
|
||||
@@ -131,6 +145,21 @@ func (c *Client) Ping() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Health sends a health check request to verify the daemon is healthy
|
||||
func (c *Client) Health() (*HealthResponse, error) {
|
||||
resp, err := c.Execute(OpHealth, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var health HealthResponse
|
||||
if err := json.Unmarshal(resp.Data, &health); err != nil {
|
||||
return nil, fmt.Errorf("failed to unmarshal health response: %w", err)
|
||||
}
|
||||
|
||||
return &health, nil
|
||||
}
|
||||
|
||||
// Create creates a new issue via the daemon
|
||||
func (c *Client) Create(args *CreateArgs) (*Response, error) {
|
||||
return c.Execute(OpCreate, args)
|
||||
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
// Operation constants for all bd commands
|
||||
const (
|
||||
OpPing = "ping"
|
||||
OpHealth = "health"
|
||||
OpCreate = "create"
|
||||
OpUpdate = "update"
|
||||
OpClose = "close"
|
||||
@@ -137,6 +138,18 @@ type PingResponse struct {
|
||||
Version string `json:"version"`
|
||||
}
|
||||
|
||||
// HealthResponse is the JSON payload returned for a health check operation.
type HealthResponse struct {
	// Status is one of "healthy", "degraded" or "unhealthy".
	Status string `json:"status"`
	// Version is the version string reported by the responder.
	Version string `json:"version"`
	// Uptime is serialized as "uptime_seconds".
	Uptime float64 `json:"uptime_seconds"`

	// CacheSize, CacheHits and CacheMisses carry the cache metrics.
	CacheSize   int   `json:"cache_size"`
	CacheHits   int64 `json:"cache_hits"`
	CacheMisses int64 `json:"cache_misses"`

	// DBResponseTime is serialized as "db_response_ms".
	DBResponseTime float64 `json:"db_response_ms"`

	// Error holds failure detail; it is omitted from the JSON when empty.
	Error string `json:"error,omitempty"`
}
|
||||
|
||||
// BatchArgs represents arguments for batch operations
|
||||
type BatchArgs struct {
|
||||
Operations []BatchOperation `json:"operations"`
|
||||
|
||||
@@ -11,6 +11,7 @@ import (
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
@@ -40,6 +41,10 @@ type Server struct {
|
||||
maxCacheSize int
|
||||
cacheTTL time.Duration
|
||||
cleanupTicker *time.Ticker
|
||||
// Health and metrics
|
||||
startTime time.Time
|
||||
cacheHits int64
|
||||
cacheMisses int64
|
||||
}
|
||||
|
||||
// NewServer creates a new RPC server
|
||||
@@ -68,6 +73,7 @@ func NewServer(socketPath string, store storage.Storage) *Server {
|
||||
maxCacheSize: maxCacheSize,
|
||||
cacheTTL: cacheTTL,
|
||||
shutdownChan: make(chan struct{}),
|
||||
startTime: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -274,6 +280,8 @@ func (s *Server) handleRequest(req *Request) Response {
|
||||
switch req.Operation {
|
||||
case OpPing:
|
||||
return s.handlePing(req)
|
||||
case OpHealth:
|
||||
return s.handleHealth(req)
|
||||
case OpCreate:
|
||||
return s.handleCreate(req)
|
||||
case OpUpdate:
|
||||
@@ -379,6 +387,66 @@ func (s *Server) handlePing(_ *Request) Response {
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Server) handleHealth(req *Request) Response {
|
||||
start := time.Now()
|
||||
|
||||
store, err := s.getStorageForRequest(req)
|
||||
if err != nil {
|
||||
data, _ := json.Marshal(HealthResponse{
|
||||
Status: "unhealthy",
|
||||
Version: "0.9.8",
|
||||
Uptime: time.Since(s.startTime).Seconds(),
|
||||
Error: fmt.Sprintf("storage error: %v", err),
|
||||
})
|
||||
return Response{
|
||||
Success: false,
|
||||
Data: data,
|
||||
Error: fmt.Sprintf("storage error: %v", err),
|
||||
}
|
||||
}
|
||||
|
||||
healthCtx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
|
||||
defer cancel()
|
||||
|
||||
status := "healthy"
|
||||
dbError := ""
|
||||
|
||||
_, pingErr := store.GetStatistics(healthCtx)
|
||||
dbResponseMs := time.Since(start).Seconds() * 1000
|
||||
|
||||
if pingErr != nil {
|
||||
status = "unhealthy"
|
||||
dbError = pingErr.Error()
|
||||
} else if dbResponseMs > 500 {
|
||||
status = "degraded"
|
||||
}
|
||||
|
||||
s.cacheMu.RLock()
|
||||
cacheSize := len(s.storageCache)
|
||||
s.cacheMu.RUnlock()
|
||||
|
||||
health := HealthResponse{
|
||||
Status: status,
|
||||
Version: "0.9.8",
|
||||
Uptime: time.Since(s.startTime).Seconds(),
|
||||
CacheSize: cacheSize,
|
||||
CacheHits: atomic.LoadInt64(&s.cacheHits),
|
||||
CacheMisses: atomic.LoadInt64(&s.cacheMisses),
|
||||
DBResponseTime: dbResponseMs,
|
||||
}
|
||||
|
||||
if dbError != "" {
|
||||
health.Error = dbError
|
||||
}
|
||||
|
||||
data, _ := json.Marshal(health)
|
||||
return Response{
|
||||
Success: status != "unhealthy",
|
||||
Data: data,
|
||||
Error: dbError,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Server) handleCreate(req *Request) Response {
|
||||
var createArgs CreateArgs
|
||||
if err := json.Unmarshal(req.Args, &createArgs); err != nil {
|
||||
@@ -849,8 +917,11 @@ func (s *Server) getStorageForRequest(req *Request) (storage.Storage, error) {
|
||||
if entry, ok := s.storageCache[repoRoot]; ok {
|
||||
// Update last access time (safe under Lock)
|
||||
entry.lastAccess = time.Now()
|
||||
atomic.AddInt64(&s.cacheHits, 1)
|
||||
return entry.store, nil
|
||||
}
|
||||
|
||||
atomic.AddInt64(&s.cacheMisses, 1)
|
||||
|
||||
// Open storage
|
||||
store, err := sqlite.New(dbPath)
|
||||
|
||||
Reference in New Issue
Block a user