Fix race condition in TestSocketCleanup by protecting listener access with mutex

Fixes bd-160 The race was between Start() writing s.listener and Stop() reading it. Now all listener access is protected by the server mutex: - Start() stores listener under lock after creation - Accept loop reads listener under RLock - Stop() closes listener under lock All RPC tests now pass with -race flag.
2025-10-19 09:14:37 -07:00
parent 91948f7a2b
commit 5aa7658433
4 changed files with 54 additions and 11 deletions
@@ -63,9 +63,10 @@
 {"id":"bd-155","title":"Daemon production readiness","description":"Make beads daemon production-ready for long-running use, multi-repo deployments, and resilient operation.\n\nCurrent state: Good foundation, works well for development\nTarget state: Production-ready for individual developers and small teams\n\nGap areas:\n1. Resource management (cache eviction, limits)\n2. Health monitoring and crash recovery\n3. Process lifecycle management\n4. User experience (visibility, feedback)\n5. Operational concerns (logging, metrics)\n\nSuccess criteria:\n- Can run for weeks without restart\n- Handles 50+ repositories efficiently\n- Recovers from crashes automatically\n- Users understand daemon status\n- Observable and debuggable","acceptance_criteria":"All child issues completed:\n- P0 issues: Storage cache, health checks, crash recovery, MCP cleanup\n- P1 issues: Global auto-start, visibility, version checks\n- P2 issues: Resource limits, telemetry, log rotation\n\nValidation:\n- Run daemon for 7+ days without issues\n- Test with 50+ repositories\n- Verify crash recovery\n- Confirm resource usage is bounded\n- Check metrics and logs are useful","status":"in_progress","priority":0,"issue_type":"epic","created_at":"2025-10-18T13:07:43.543715-07:00","updated_at":"2025-10-18T18:35:11.752924-07:00"}
 {"id":"bd-156","title":"Refactor import logic to eliminate duplication between manual and auto-import","description":"The import logic is duplicated in two places:\n1. cmd/bd/import.go (manual 'bd import' command)\n2. cmd/bd/main.go:autoImportIfNewer() (auto-import after git pull)\n\nBoth have nearly identical code for:\n- Reading and parsing JSONL\n- Type-asserting store to *sqlite.SQLiteStorage (where we just fixed a bug twice)\n- Opening direct SQLite connection when using daemon mode\n- Detecting collisions with sqlite.DetectCollisions()\n- Scoring and remapping collisions\n- Importing issues, dependencies, and labels\n\n**Problems:**\n- Bugs must be fixed in two places (we just did this for daemon mode)\n- Features must be implemented twice\n- Tests must cover both code paths\n- Harder to maintain and keep in sync\n- Higher risk of divergence over time\n\n**Proposed solution:**\nExtract a shared function that handles the core import logic:\n\n```go\n// importIssues handles the core import logic used by both manual and auto-import\nfunc importIssues(ctx context.Context, dbPath string, store storage.Storage, \n                  issues []*types.Issue, opts ImportOptions) (*ImportResult, error) {\n    // Handle SQLite store detection/creation for daemon mode\n    // Detect collisions\n    // Score and remap if needed\n    // Import issues, dependencies, labels\n    // Return result\n}\n```\n\nBoth import.go and autoImportIfNewer() would call this shared function with their specific options.\n\n**Benefits:**\n- Single source of truth for import logic\n- Bugs fixed once\n- Easier to test\n- Easier to extend with new import features\n- Less code overall","status":"closed","priority":2,"issue_type":"chore","created_at":"2025-10-18T17:07:06.007026-07:00","updated_at":"2025-10-18T18:35:11.753484-07:00","closed_at":"2025-10-18T17:11:20.280214-07:00"}
 {"id":"bd-157","title":"Complete auto-import refactoring to use shared importIssuesCore function","description":"The manual import command (bd import) was successfully refactored to use the shared importIssuesCore() function in import_shared.go, reducing code from 494 lines to 170 lines.\n\nHowever, autoImportIfNewer() in cmd/bd/main.go still has ~298 lines of duplicated import logic that should use the same shared function.\n\n**Current state:**\n- ✅ Manual import uses importIssuesCore() (commit 790233f)\n- ❌ Auto-import still has duplicated logic (lines 618-915 in main.go)\n\n**Duplication includes:**\n- SQLite store detection/creation for daemon mode (fixed in 790233f)\n- Collision detection with sqlite.DetectCollisions()\n- Scoring and remapping collisions\n- Importing issues (update existing, create new)\n- Importing dependencies\n- Importing labels\n\n**Benefits of completing this:**\n- Remove ~200 more lines of duplicated code\n- Ensure manual and auto-import have identical behavior\n- Future bug fixes only need to be made once\n- Easier to test and maintain\n\n**Implementation:**\nReplace lines 714-908 in autoImportIfNewer() with:\n```go\nopts := ImportOptions{\n    ResolveCollisions: true,  // Auto-import always resolves\n    DryRun:            false,\n    SkipUpdate:        false,\n    Strict:            false,\n}\nresult, err := importIssuesCore(ctx, dbPath, store, allIssues, opts)\n// Handle result and show remapping notification\n```\n\nThen update hash storage logic at the end.","status":"closed","priority":2,"issue_type":"chore","created_at":"2025-10-18T17:38:34.443872-07:00","updated_at":"2025-10-18T18:35:11.754006-07:00","closed_at":"2025-10-18T18:07:05.553928-07:00"}
-{"id":"bd-158","title":"Add .gitignore to prevent noisy untracked beads files","description":"When using beads, git status shows several untracked files in .beads/ directory: .beads/.gitignore, .beads/db.sqlite, daemon.pid and daemon.lock files. These should be added to the project's .gitignore to prevent noise.","status":"open","priority":2,"issue_type":"chore","created_at":"2025-10-18T18:27:16.424878-07:00","updated_at":"2025-10-18T18:35:11.754574-07:00"}
+{"id":"bd-158","title":"Add .gitignore to prevent noisy untracked beads files","description":"When using beads, git status shows several untracked files in .beads/ directory: .beads/.gitignore, .beads/db.sqlite, daemon.pid and daemon.lock files. These should be added to the project's .gitignore to prevent noise.","status":"closed","priority":2,"issue_type":"chore","created_at":"2025-10-18T18:27:16.424878-07:00","updated_at":"2025-10-19T09:05:48.4899-07:00","closed_at":"2025-10-19T09:05:48.4899-07:00"}
 {"id":"bd-159","title":"Implement --max-depth flag for bd dep tree","description":"PR #87 adds the flag but doesn't wire it through. Need to:\n1. Add flag definition in cmd/bd/dep.go\n2. Pass maxDepth to store.GetDependencyTree()\n3. Fix truncation warning to show actual depth used\n4. Add tests for truncation behavior (TestGetDependencyTree_TruncationDepth, TestGetDependencyTree_DefaultDepth)\n5. Update CLI docs/help\n\nDefault should remain 50. Keep using direct storage mode (no RPC needed for now).\n\nRelated: PR #87, bd-3","status":"closed","priority":2,"issue_type":"feature","created_at":"2025-10-19T08:31:15.473267-07:00","updated_at":"2025-10-19T08:55:21.266386-07:00","closed_at":"2025-10-19T08:55:21.266386-07:00"}
 {"id":"bd-16","title":"Add EXPLAIN QUERY PLAN tests for ready work query","description":"Verify that the hierarchical blocking query uses proper indexes and doesn't do full table scans.\n\n**Queries to analyze:**\n1. The recursive CTE (both base case and recursive case)\n2. The final SELECT with NOT EXISTS\n3. Impact of various filters (status, priority, assignee)\n\n**Implementation:**\nAdd test function that:\n- Runs EXPLAIN QUERY PLAN on GetReadyWork query\n- Parses output to verify no SCAN TABLE operations\n- Documents expected query plan in comments\n- Fails if query plan degrades\n\n**Benefits:**\n- Catch performance regressions in tests\n- Document expected query behavior\n- Ensure indexes are being used\n\nRelated to: bd-77 (composite index on depends_on_id, type)","status":"closed","priority":3,"issue_type":"task","created_at":"2025-10-16T20:46:08.971822-07:00","updated_at":"2025-10-18T18:35:11.755001-07:00","closed_at":"2025-10-18T12:47:44.284846-07:00"}
+{"id":"bd-160","title":"Fix race condition in TestSocketCleanup","description":"Race condition detected in internal/rpc/rpc_test.go:195 in TestSocketCleanup. This is causing CI test failures.\n\nThe race appears to be between goroutines accessing shared state during server startup/shutdown in the socket cleanup test.\n\nLocation: internal/rpc/rpc_test.go:195\nTest output shows DATA RACE between goroutines 83 and 85.","status":"in_progress","priority":1,"issue_type":"bug","created_at":"2025-10-19T09:11:34.766584-07:00","updated_at":"2025-10-19T09:13:05.448924-07:00"}
 {"id":"bd-17","title":"Make auto-flush debounce duration configurable","description":"flushDebounce is hardcoded to 5 seconds. Make it configurable via environment variable BEADS_FLUSH_DEBOUNCE (e.g., '500ms', '10s'). Current 5-second value is reasonable for interactive use, but CI/automated scenarios might want faster flush. Add getDebounceDuration() helper function. Located in cmd/bd/main.go:31.","status":"closed","priority":3,"issue_type":"feature","created_at":"2025-10-16T20:46:08.971822-07:00","updated_at":"2025-10-18T18:35:11.755588-07:00","closed_at":"2025-10-18T09:47:43.22126-07:00"}
 {"id":"bd-18","title":"Optimize auto-flush to use incremental updates","description":"Every flush exports ALL issues and ALL dependencies, even if only one issue changed. For large projects (1000+ issues), this could be expensive. Current approach guarantees consistency, which is fine for MVP, but future optimization could track which issues changed and use incremental updates. Located in cmd/bd/main.go:255-276.","status":"closed","priority":3,"issue_type":"feature","created_at":"2025-10-16T20:46:08.971822-07:00","updated_at":"2025-10-18T18:35:11.755965-07:00","closed_at":"2025-10-14T02:51:52.200141-07:00"}
 {"id":"bd-19","title":"Refactor duplicate flush logic in PersistentPostRun","description":"PersistentPostRun contains a complete copy of the flush logic instead of calling flushToJSONL(). This violates DRY principle and makes maintenance harder. Refactor to use flushToJSONL() with a force parameter to bypass isDirty check, or extract shared logic into a helper function. Located in cmd/bd/main.go:104-138.","status":"closed","priority":3,"issue_type":"task","created_at":"2025-10-16T20:46:08.971822-07:00","updated_at":"2025-10-18T18:35:11.756336-07:00","closed_at":"2025-10-18T09:44:24.167574-07:00"}
@@ -38,5 +38,10 @@ Thumbs.db
 .beads/daemon.pid
 .beads/bd.sock

+# .beads directory files (keep JSONL only)
+.beads/.gitignore
+.beads/db.sqlite
+.beads/bd.db
+
 # Keep JSONL exports (source of truth for git)
 !.beads/*.jsonl
@@ -192,11 +192,32 @@ func TestSocketCleanup(t *testing.T) {
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()

-	go server.Start(ctx)
-	time.Sleep(100 * time.Millisecond)
+	// Start server in goroutine
+	started := make(chan error, 1)
+	go func() {
+		err := server.Start(ctx)
+		if err != nil {
+			started <- err
+		}
+	}()

-	if _, err := os.Stat(socketPath); os.IsNotExist(err) {
-		t.Fatal("Socket file not created")
+	// Wait for socket to be created (with timeout)
+	timeout := time.After(5 * time.Second)
+	ticker := time.NewTicker(10 * time.Millisecond)
+	defer ticker.Stop()
+
+	socketReady := false
+	for !socketReady {
+		select {
+		case err := <-started:
+			t.Fatalf("Server failed to start: %v", err)
+		case <-timeout:
+			t.Fatal("Timeout waiting for socket creation")
+		case <-ticker.C:
+			if _, err := os.Stat(socketPath); err == nil {
+				socketReady = true
+			}
+		}
 	}

 	if err := server.Stop(); err != nil {
@@ -94,23 +94,33 @@ func (s *Server) Start(ctx context.Context) error {
 		return fmt.Errorf("failed to remove old socket: %w", err)
 	}

-	var err error
-	s.listener, err = net.Listen("unix", s.socketPath)
+	listener, err := net.Listen("unix", s.socketPath)
 	if err != nil {
 		return fmt.Errorf("failed to listen on socket: %w", err)
 	}

 	// Set socket permissions to 0600 for security (owner only)
 	if err := os.Chmod(s.socketPath, 0600); err != nil {
-		s.listener.Close()
+		listener.Close()
 		return fmt.Errorf("failed to set socket permissions: %w", err)
 	}

+	// Store listener under lock
+	s.mu.Lock()
+	s.listener = listener
+	s.mu.Unlock()
+
 	go s.handleSignals()
 	go s.runCleanupLoop()

+	// Accept connections using listener
 	for {
-		conn, err := s.listener.Accept()
+		// Get listener under lock
+		s.mu.RLock()
+		listener := s.listener
+		s.mu.RUnlock()
+		
+		conn, err := listener.Accept()
 		if err != nil {
 			s.mu.Lock()
 			shutdown := s.shutdown
@@ -152,8 +162,14 @@ func (s *Server) Stop() error {
 			}
 		}

-		if s.listener != nil {
-			if closeErr := s.listener.Close(); closeErr != nil {
+		// Close listener under lock
+		s.mu.Lock()
+		listener := s.listener
+		s.listener = nil
+		s.mu.Unlock()
+		
+		if listener != nil {
+			if closeErr := listener.Close(); closeErr != nil {
 				err = fmt.Errorf("failed to close listener: %w", closeErr)
 				return
 			}