Add storage cache eviction policy to daemon (bd-145)

Implemented TTL-based and LRU cache eviction for daemon storage connections: - Add StorageCacheEntry with lastAccess timestamp tracking - Cleanup goroutine runs every 5 minutes to evict stale entries - TTL-based eviction: remove entries idle >30min (configurable) - LRU eviction: enforce max cache size (default: 50 repos) - Configurable via BEADS_DAEMON_MAX_CACHE_SIZE and BEADS_DAEMON_CACHE_TTL - Proper cleanup on server shutdown - Update lastAccess on cache hits - Comprehensive tests for eviction logic Fixes memory leaks and file descriptor exhaustion for multi-repo users. Amp-Thread-ID: https://ampcode.com/threads/T-1148d8b3-b8a8-45fc-af9c-b5be14c4834d Co-authored-by: Amp <amp@ampcode.com>
2025-10-18 13:17:07 -07:00
parent 491cb82489
commit 259e994522
3 changed files with 460 additions and 32 deletions
--- a/internal/rpc/server.go
+++ b/internal/rpc/server.go
@@ -18,24 +18,54 @@ import (
 	"github.com/steveyegge/beads/internal/types"
 )

+// StorageCacheEntry holds a cached storage with metadata for eviction
+type StorageCacheEntry struct {
+	store      storage.Storage
+	lastAccess time.Time
+}
+
 // Server represents the RPC server that runs in the daemon
 type Server struct {
-	socketPath string
-	storage    storage.Storage // Default storage (for backward compat)
-	listener   net.Listener
-	mu         sync.RWMutex
-	shutdown   bool
-	// Per-request storage routing
-	storageCache map[string]storage.Storage // path -> storage
-	cacheMu      sync.RWMutex
+	socketPath   string
+	storage      storage.Storage // Default storage (for backward compat)
+	listener     net.Listener
+	mu           sync.RWMutex
+	shutdown     bool
+	shutdownChan chan struct{}
+	// Per-request storage routing with eviction support
+	storageCache   map[string]*StorageCacheEntry // path -> entry
+	cacheMu        sync.RWMutex
+	maxCacheSize   int
+	cacheTTL       time.Duration
+	cleanupTicker  *time.Ticker
 }

 // NewServer creates a new RPC server
 func NewServer(socketPath string, store storage.Storage) *Server {
+	// Parse config from env vars
+	maxCacheSize := 50 // default
+	if env := os.Getenv("BEADS_DAEMON_MAX_CACHE_SIZE"); env != "" {
+		// Parse as integer
+		var size int
+		if _, err := fmt.Sscanf(env, "%d", &size); err == nil && size > 0 {
+			maxCacheSize = size
+		}
+	}
+	
+	cacheTTL := 30 * time.Minute // default
+	if env := os.Getenv("BEADS_DAEMON_CACHE_TTL"); env != "" {
+		if ttl, err := time.ParseDuration(env); err == nil {
+			cacheTTL = ttl
+		}
+	}
+
 	return &Server{
 		socketPath:   socketPath,
 		storage:      store,
-		storageCache: make(map[string]storage.Storage),
+		storageCache: make(map[string]*StorageCacheEntry),
+		maxCacheSize: maxCacheSize,
+		cacheTTL:     cacheTTL,
+		shutdownChan: make(chan struct{}),
 	}
 }

@@ -62,6 +92,7 @@ func (s *Server) Start(ctx context.Context) error {
 	}

 	go s.handleSignals()
+	go s.runCleanupLoop()

 	for {
 		conn, err := s.listener.Accept()
@@ -85,6 +116,23 @@ func (s *Server) Stop() error {
 	s.shutdown = true
 	s.mu.Unlock()

+	// Signal cleanup goroutine to stop
+	close(s.shutdownChan)
+	
+	if s.cleanupTicker != nil {
+		s.cleanupTicker.Stop()
+	}
+
+	// Close all cached storage connections
+	s.cacheMu.Lock()
+	for _, entry := range s.storageCache {
+		if err := entry.store.Close(); err != nil {
+			fmt.Fprintf(os.Stderr, "Warning: failed to close storage: %v\n", err)
+		}
+	}
+	s.storageCache = make(map[string]*StorageCacheEntry)
+	s.cacheMu.Unlock()
+
 	if s.listener != nil {
 		if err := s.listener.Close(); err != nil {
 			return fmt.Errorf("failed to close listener: %w", err)
@@ -122,6 +170,76 @@ func (s *Server) handleSignals() {
 	s.Stop()
 }

+// runCleanupLoop periodically evicts stale storage connections
+func (s *Server) runCleanupLoop() {
+	s.cleanupTicker = time.NewTicker(5 * time.Minute)
+	defer s.cleanupTicker.Stop()
+
+	for {
+		select {
+		case <-s.cleanupTicker.C:
+			s.evictStaleStorage()
+		case <-s.shutdownChan:
+			return
+		}
+	}
+}
+
+// evictStaleStorage removes idle connections and enforces cache size limits
+func (s *Server) evictStaleStorage() {
+	now := time.Now()
+	toClose := []storage.Storage{}
+	
+	s.cacheMu.Lock()
+	defer s.cacheMu.Unlock()
+
+	// First pass: evict TTL-expired entries
+	for path, entry := range s.storageCache {
+		if now.Sub(entry.lastAccess) > s.cacheTTL {
+			toClose = append(toClose, entry.store)
+			delete(s.storageCache, path)
+		}
+	}
+
+	// Second pass: enforce max cache size with LRU eviction
+	if len(s.storageCache) > s.maxCacheSize {
+		// Build sorted list of entries by lastAccess
+		type cacheItem struct {
+			path       string
+			entry      *StorageCacheEntry
+		}
+		items := make([]cacheItem, 0, len(s.storageCache))
+		for path, entry := range s.storageCache {
+			items = append(items, cacheItem{path, entry})
+		}
+
+		// Sort by lastAccess (oldest first)
+		for i := 0; i < len(items)-1; i++ {
+			for j := i + 1; j < len(items); j++ {
+				if items[i].entry.lastAccess.After(items[j].entry.lastAccess) {
+					items[i], items[j] = items[j], items[i]
+				}
+			}
+		}
+
+		// Evict oldest entries until we're under the limit
+		numToEvict := len(s.storageCache) - s.maxCacheSize
+		for i := 0; i < numToEvict && i < len(items); i++ {
+			toClose = append(toClose, items[i].entry.store)
+			delete(s.storageCache, items[i].path)
+		}
+	}
+
+	// Close connections outside of lock to avoid blocking
+	go func() {
+		for _, store := range toClose {
+			if err := store.Close(); err != nil {
+				fmt.Fprintf(os.Stderr, "Warning: failed to close evicted storage: %v\n", err)
+			}
+		}
+	}()
+}
+
 func (s *Server) handleConnection(conn net.Conn) {
 	defer conn.Close()

@@ -712,11 +830,13 @@ func (s *Server) getStorageForRequest(req *Request) (storage.Storage, error) {
 	}

 	// Check cache first
-	s.cacheMu.RLock()
-	cached, ok := s.storageCache[req.Cwd]
-	s.cacheMu.RUnlock()
-	if ok {
-		return cached, nil
+	s.cacheMu.Lock()
+	defer s.cacheMu.Unlock()
+	
+	if entry, ok := s.storageCache[req.Cwd]; ok {
+		// Update last access time
+		entry.lastAccess = time.Now()
+		return entry.store, nil
 	}

 	// Find database for this cwd
@@ -731,10 +851,11 @@ func (s *Server) getStorageForRequest(req *Request) (storage.Storage, error) {
 		return nil, fmt.Errorf("failed to open database at %s: %w", dbPath, err)
 	}

-	// Cache it
-	s.cacheMu.Lock()
-	s.storageCache[req.Cwd] = store
-	s.cacheMu.Unlock()
+	// Cache it with current timestamp
+	s.storageCache[req.Cwd] = &StorageCacheEntry{
+		store:      store,
+		lastAccess: time.Now(),
+	}

 	return store, nil
 }
@@ -784,9 +905,9 @@ func (s *Server) handleReposList(_ *Request) Response {
 	defer s.cacheMu.RUnlock()

 	repos := make([]RepoInfo, 0, len(s.storageCache))
-	for path, store := range s.storageCache {
+	for path, entry := range s.storageCache {
 		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
-		stats, err := store.GetStatistics(ctx)
+		stats, err := entry.store.GetStatistics(ctx)
 		cancel()
 		if err != nil {
 			continue
@@ -795,7 +916,7 @@ func (s *Server) handleReposList(_ *Request) Response {
 		// Extract prefix from a sample issue
 		filter := types.IssueFilter{Limit: 1}
 		ctx2, cancel2 := context.WithTimeout(context.Background(), 1*time.Second)
-		issues, err := store.SearchIssues(ctx2, "", filter)
+		issues, err := entry.store.SearchIssues(ctx2, "", filter)
 		cancel2()
 		prefix := ""
 		if err == nil && len(issues) > 0 && len(issues[0].ID) > 0 {
@@ -813,7 +934,7 @@ func (s *Server) handleReposList(_ *Request) Response {
 			Path:       path,
 			Prefix:     prefix,
 			IssueCount: stats.TotalIssues,
-			LastAccess: "active",
+			LastAccess: entry.lastAccess.Format(time.RFC3339),
 		})
 	}

@@ -839,7 +960,7 @@ func (s *Server) handleReposReady(req *Request) Response {

 	if args.GroupByRepo {
 		result := make([]RepoReadyWork, 0, len(s.storageCache))
-		for path, store := range s.storageCache {
+		for path, entry := range s.storageCache {
 			filter := types.WorkFilter{
 				Status: types.StatusOpen,
 				Limit:  args.Limit,
@@ -852,7 +973,7 @@ func (s *Server) handleReposReady(req *Request) Response {
 			}

 			ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
-			issues, err := store.GetReadyWork(ctx, filter)
+			issues, err := entry.store.GetReadyWork(ctx, filter)
 			cancel()
 			if err != nil || len(issues) == 0 {
 				continue
@@ -873,7 +994,7 @@ func (s *Server) handleReposReady(req *Request) Response {

 	// Flat list of all ready issues across all repos
 	allIssues := make([]ReposReadyIssue, 0)
-	for path, store := range s.storageCache {
+	for path, entry := range s.storageCache {
 		filter := types.WorkFilter{
 			Status: types.StatusOpen,
 			Limit:  args.Limit,
@@ -886,7 +1007,7 @@ func (s *Server) handleReposReady(req *Request) Response {
 		}

 		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
-		issues, err := store.GetReadyWork(ctx, filter)
+		issues, err := entry.store.GetReadyWork(ctx, filter)
 		cancel()
 		if err != nil {
 			continue
@@ -916,9 +1037,9 @@ func (s *Server) handleReposStats(_ *Request) Response {
 	perRepo := make(map[string]types.Statistics)
 	errors := make(map[string]string)

-	for path, store := range s.storageCache {
+	for path, entry := range s.storageCache {
 		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
-		stats, err := store.GetStatistics(ctx)
+		stats, err := entry.store.GetStatistics(ctx)
 		cancel()
 		if err != nil {
 			errors[path] = err.Error()
@@ -957,10 +1078,10 @@ func (s *Server) handleReposClearCache(_ *Request) Response {
 	// to avoid holding lock during potentially slow Close() operations
 	s.cacheMu.Lock()
 	stores := make([]storage.Storage, 0, len(s.storageCache))
-	for _, store := range s.storageCache {
-		stores = append(stores, store)
+	for _, entry := range s.storageCache {
+		stores = append(stores, entry.store)
 	}
-	s.storageCache = make(map[string]storage.Storage)
+	s.storageCache = make(map[string]*StorageCacheEntry)
 	s.cacheMu.Unlock()

 	// Close all storage connections without holding lock