From 744563e87ff2bcbf9396c4828a72ed2e8754dfca Mon Sep 17 00:00:00 2001
From: Steve Yegge <stevey@sourcegraph.com>
Date: Sat, 25 Oct 2025 17:39:21 -0700
Subject: [PATCH] Add health checks and reconnection logic for stale daemon
 sockets (bd-137)

- Add ping() and health() methods to BdDaemonClient for connection verification
- Implement _health_check_client() to verify cached client connections
- Add _reconnect_client() with exponential backoff (0.1s, 0.2s, 0.4s, max 3 retries)
- Update _get_client() to health-check before returning cached clients
- Automatically detect and remove stale connections from pool
- Add comprehensive test suite with 14 tests covering all scenarios
- Handle daemon restarts, upgrades, and long-idle connections gracefully

Amp-Thread-ID: https://ampcode.com/threads/T-2366ef1b-389c-4293-8145-7613037c9dfa
Co-authored-by: Amp <amp@ampcode.com>
---
 .beads/beads.jsonl                            |   2 +-
 .../src/beads_mcp/bd_daemon_client.py         |  23 ++
 integrations/beads-mcp/src/beads_mcp/tools.py |  80 ++++-
 .../tests/test_daemon_health_check.py         | 304 ++++++++++++++++++
 4 files changed, 405 insertions(+), 4 deletions(-)
 create mode 100644 integrations/beads-mcp/tests/test_daemon_health_check.py

diff --git a/.beads/beads.jsonl b/.beads/beads.jsonl
index 1d3102a2..2b74175d 100644
--- a/.beads/beads.jsonl
+++ b/.beads/beads.jsonl
@@ -40,7 +40,7 @@
 {"id":"bd-134","title":"Address gosec security warnings (102 issues)","description":"Security linter warnings: file permissions (0755 should be 0750), G304 file inclusion via variable, G204 subprocess launches. Many are false positives but should be reviewed.","design":"Review each gosec warning. Add exclusions for legitimate cases to .golangci.yml. Fix real security issues (overly permissive file modes).","status":"open","priority":2,"issue_type":"task","created_at":"2025-10-25T13:47:10.719134-07:00","updated_at":"2025-10-25T13:47:10.719134-07:00"}
 {"id":"bd-135","title":"Multi-project MCP context switching (GH#145)","description":"Enable MCP server to manage multiple beads projects in a single session with per-request workspace_root parameter.\n\nCurrent bug: set_context(workspace_root) doesn't actually switch sockets - all operations hit first initialized socket.\n\nUse case: Managing tasks across multiple organizations with different permission models (internal, partners, open source).\n\nArchitecture: Connection pool keyed by workspace_root, each maintaining its own daemon socket connection. Request-scoped routing using ContextVar to avoid global state races.\n\nSee GH#145 for full requirements and user context.","design":"✅ APPROVED WITH MODIFICATIONS by architectural review (2025-10-25)\n\nLSP-style model is correct: Single MCP server → per-project daemons → isolated databases.\n\nCRITICAL CHANGES from review:\n1. **Simplify connection pool**: No LRU eviction initially (typical user: 2-5 projects)\n2. **Add asyncio.Lock**: Prevent race conditions in pool access\n3. **Defer health checks**: Only retry on failure, not preemptive pings\n4. **Handle submodules**: Check local .beads BEFORE git toplevel\n5. **Path canonicalization**: realpath + git toplevel with caching\n\nRISKS MITIGATED:\n- Global _client bug: Replace with connection pool keyed by canonical path\n- Race conditions: Add asyncio.Lock for pool mutations\n- Submodule edge case: Check .beads directory first\n- Stale sockets: Retry once on connection failure\n\nEstimated effort: 2.5-3.5 days (simplified from 2.5-4.5 days)\nConfidence: 8/10","acceptance_criteria":"- Multiple projects can be accessed in single MCP session\n- Per-request workspace_root parameter works on all tools\n- No cross-project data leakage\n- Concurrent calls to different projects work correctly\n- Stale sockets auto-reconnect with retry/backoff\n- Integration tests verify isolation across 2+ temp repos\n- set_context() still works as default fallback","notes":"Review doc: docs/bd-135-architectural-review.md\n\nImplementation order:\n1. bd-139 (connection manager) - Foundation with pool + lock\n2. bd-136 (ContextVar routing) - Per-request workspace\n3. bd-138 (require_context) - Validation\n4. bd-140 (tests) - Concurrency + edge cases\n5. bd-141 (docs) - Usage guide\n6. bd-142 (health checks) - DEFERRED to Phase 2","status":"closed","priority":1,"issue_type":"epic","assignee":"amp","created_at":"2025-10-25T13:59:57.231937-07:00","updated_at":"2025-10-25T14:36:02.046142-07:00","closed_at":"2025-10-25T14:36:02.046142-07:00"}
 {"id":"bd-136","title":"Implement request-scoped routing with ContextVar","description":"Add ContextVar-based routing to avoid global state races during concurrent multi-project calls.\n\nApproach:\n- Define current_workspace: ContextVar[str|None] in server.py\n- Add @with_workspace decorator that resolves workspace_root (via _resolve_workspace_root + realpath)\n- Set ContextVar for duration of tool call, reset after\n- Falls back to set_context default (BEADS_WORKING_DIR) if workspace_root not provided\n- beads_mcp.tools.get_client() reads current_workspace from ContextVar\n\nBlocks: bd-135 (connection manager must exist first)","design":"Decorator pattern with ContextVar for request-scoped workspace routing.\n\n@with_workspace decorator:\n- Extract workspace_root parameter from tool call\n- Resolve via _resolve_workspace_root + realpath\n- Set current_workspace ContextVar for request duration\n- Falls back to BEADS_WORKING_DIR if workspace_root not provided\n- Reset ContextVar after tool completes\n\nApplied to all tools in server.py. _get_client() reads current_workspace.\n\n⚠️ CONCURRENCY GOTCHA (from architectural review):\n- ContextVar doesn't propagate to asyncio.create_task() spawned tasks\n- SOLUTION: Keep tool calls synchronous, no background task spawning\n- If background tasks needed: use contextvars.copy_context()\n\nDocument this limitation in bd-141.","notes":"Blocks on bd-139 (connection pool must exist first).\n\nCRITICAL: Do NOT spawn background tasks within tool implementations.\nContextVar propagation to spawned tasks is unreliable.","status":"closed","priority":1,"issue_type":"task","assignee":"amp","created_at":"2025-10-25T14:00:27.895512-07:00","updated_at":"2025-10-25T14:32:36.531658-07:00","closed_at":"2025-10-25T14:32:36.531658-07:00","dependencies":[{"issue_id":"bd-136","depends_on_id":"bd-135","type":"parent-child","created_at":"2025-10-25T14:00:27.896366-07:00","created_by":"daemon"}]}
-{"id":"bd-137","title":"Add health checks and reconnection logic for stale sockets","description":"Handle stale/broken socket connections after daemon restarts or upgrades.\n\nFeatures:\n- ping/health_check method on client\n- Check before use or periodic health check\n- On failure: drop from pool, attempt reconnect with exponential backoff\n- Clear error propagation if reconnect fails after retries\n- Handle version mismatch after daemon upgrade\n- Handle long-idle connections closed by daemon","design":"Add async ping() to DaemonClient. ConnectionManager.get_client() pings before returning cached client. On failure, del from pool and retry connect. Bounded retry (3-5 attempts) with backoff.","status":"open","priority":1,"issue_type":"task","created_at":"2025-10-25T14:00:27.8967-07:00","updated_at":"2025-10-25T14:00:27.8967-07:00"}
+{"id":"bd-137","title":"Add health checks and reconnection logic for stale sockets","description":"Handle stale/broken socket connections after daemon restarts or upgrades.\n\nFeatures:\n- ping/health_check method on client\n- Check before use or periodic health check\n- On failure: drop from pool, attempt reconnect with exponential backoff\n- Clear error propagation if reconnect fails after retries\n- Handle version mismatch after daemon upgrade\n- Handle long-idle connections closed by daemon","design":"Add async ping() to DaemonClient. ConnectionManager.get_client() pings before returning cached client. On failure, del from pool and retry connect. Bounded retry (3-5 attempts) with backoff.","status":"in_progress","priority":1,"issue_type":"task","created_at":"2025-10-25T14:00:27.8967-07:00","updated_at":"2025-10-25T17:28:05.384177-07:00"}
 {"id":"bd-138","title":"Fix require_context to support per-request workspace_root","description":"Update require_context decorator to pass if either:\n- workspace_root was provided on the tool call (via ContextVar), OR\n- BEADS_WORKING_DIR is set (from set_context)\n\nStop using BEADS_DB as router - treat it only as CLI fallback.\n\nEnsures backward compatibility with set_context() while supporting new per-request routing.","status":"closed","priority":2,"issue_type":"task","created_at":"2025-10-25T14:00:27.896646-07:00","updated_at":"2025-10-25T14:32:36.533618-07:00","closed_at":"2025-10-25T14:32:36.533618-07:00","dependencies":[{"issue_id":"bd-138","depends_on_id":"bd-135","type":"parent-child","created_at":"2025-10-25T14:00:27.89804-07:00","created_by":"daemon"}]}
 {"id":"bd-139","title":"Add connection manager for per-project daemon sockets","description":"Create connection pool for managing multiple daemon socket connections in beads_mcp.tools.\n\nSIMPLIFIED APPROACH (per architectural review):\n- Connection pool dict[str, BdClientBase] keyed by canonical path\n- Add asyncio.Lock for thread-safe pool access (CRITICAL)\n- Path canonicalization with @lru_cache for performance\n- NO LRU eviction (start simple, add if monitoring shows memory issues)\n- NO preemptive health checks (only retry on failure)\n\nFeatures:\n- get_client(workspace_root) → returns client for that project\n- Canonicalization: realpath + git toplevel (handles symlinks)\n- Submodule-aware: Check local .beads BEFORE git toplevel\n- Auto-start daemon if socket missing (existing behavior)\n- Validate version on connect (existing behavior)\n\nCRITICAL: Replace global _client singleton with pool to fix set_context() bug.","design":"Implementation in tools.py:\n\n```python\nfrom contextvars import ContextVar\nimport asyncio\nfrom functools import lru_cache\n\ncurrent_workspace: ContextVar[str | None] = ContextVar('workspace', default=None)\n_connection_pool: Dict[str, BdClientBase] = {}\n_pool_lock = asyncio.Lock()  # Prevent race conditions\n\n@lru_cache(maxsize=128)\ndef _canonicalize_path(path: str) -\u003e str:\n    # 1. realpath to resolve symlinks\n    real = os.path.realpath(path)\n    \n    # 2. Check local .beads (submodule edge case)\n    if os.path.exists(os.path.join(real, \".beads\")):\n        return real\n    \n    # 3. Try git toplevel\n    return _resolve_workspace_root(real)\n\nasync def _get_client() -\u003e BdClientBase:\n    workspace = current_workspace.get() or os.environ.get(\"BEADS_WORKING_DIR\")\n    if not workspace:\n        raise BdError(\"No workspace set\")\n    \n    canonical = _canonicalize_path(workspace)\n    \n    async with _pool_lock:  # CRITICAL: prevent concurrent mutations\n        if canonical not in _connection_pool:\n            _connection_pool[canonical] = create_bd_client(\n                prefer_daemon=True,\n                working_dir=canonical\n            )\n        return _connection_pool[canonical]\n```\n\n~200 LOC total (simplified from ~400-500)","status":"closed","priority":1,"issue_type":"task","assignee":"amp","created_at":"2025-10-25T14:00:27.896896-07:00","updated_at":"2025-10-25T14:31:08.910695-07:00","closed_at":"2025-10-25T14:31:08.910695-07:00","dependencies":[{"issue_id":"bd-139","depends_on_id":"bd-135","type":"parent-child","created_at":"2025-10-25T14:00:27.89917-07:00","created_by":"daemon"}]}
 {"id":"bd-14","title":"Auto-flush writes test pollution and session work to git-tracked issues.jsonl","description":"Auto-flush exports ALL issues from DB to issues.jsonl every 5 seconds, including:\n- Test issues (bd-4053 through bd-4059 were version test junk)\n- Issues created during debugging sessions\n- Test pollution from stress tests\n- Temporary diagnostic issues\n\nThis pollutes the git-tracked issues.jsonl with garbage that shouldn't be committed.\n\nExample from today:\n- Git had 49 clean issues\n- Our DB grew to 100+ with test junk and session work\n- Auto-flush wrote all 100+ to issues.jsonl\n- Git status showed modified issues.jsonl with 50+ unwanted issues\n\nImpact:\n- Pollutes git history with test/debug garbage\n- Makes code review difficult (noise in diffs)\n- Can't distinguish real work from session artifacts\n- Other team members pull polluted issues\n\nSolutions to consider:\n1. Disable auto-flush by default (require explicit --enable-auto-flush)\n2. Add .beadsignore to exclude issue ID patterns\n3. Make auto-flush only export 'real' issues (exclude test-*)\n4. Require manual 'bd sync' for git commit\n5. Auto-flush to separate file (.beads/session.jsonl vs issues.jsonl)\n\nRelated: bd-117 (test pollution), isolation_test.go (test DB separation)","status":"closed","priority":1,"issue_type":"bug","created_at":"2025-10-22T00:05:10.788996-07:00","updated_at":"2025-10-24T13:51:54.437366-07:00","closed_at":"2025-10-22T01:05:59.459797-07:00"}
diff --git a/integrations/beads-mcp/src/beads_mcp/bd_daemon_client.py b/integrations/beads-mcp/src/beads_mcp/bd_daemon_client.py
index 08861007..5d10d22a 100644
--- a/integrations/beads-mcp/src/beads_mcp/bd_daemon_client.py
+++ b/integrations/beads-mcp/src/beads_mcp/bd_daemon_client.py
@@ -200,10 +200,33 @@ class BdDaemonClient(BdClientBase):
 
         Raises:
             DaemonNotRunningError: If daemon is not running
+            DaemonConnectionError: If connection fails
+            DaemonError: If request fails
         """
         data = await self._send_request("ping", {})
         return json.loads(data) if isinstance(data, str) else data
 
+    async def health(self) -> Dict[str, Any]:
+        """Get daemon health status.
+
+        Returns:
+            Dict with health info including:
+            - status: "healthy" | "degraded" | "unhealthy"
+            - version: daemon version string
+            - uptime: uptime in seconds
+            - cache_size: number of cached databases
+            - db_response_time_ms: database ping time
+            - active_connections: number of active connections
+            - memory_bytes: memory usage
+
+        Raises:
+            DaemonNotRunningError: If daemon is not running
+            DaemonConnectionError: If connection fails
+            DaemonError: If request fails
+        """
+        data = await self._send_request("health", {})
+        return json.loads(data) if isinstance(data, str) else data
+
     async def init(self, params: Optional[InitParams] = None) -> str:
         """Initialize new beads database (not typically used via daemon).
 
diff --git a/integrations/beads-mcp/src/beads_mcp/tools.py b/integrations/beads-mcp/src/beads_mcp/tools.py
index d0454d15..60139d55 100644
--- a/integrations/beads-mcp/src/beads_mcp/tools.py
+++ b/integrations/beads-mcp/src/beads_mcp/tools.py
@@ -110,12 +110,76 @@ def _canonicalize_path(path: str) -> str:
     return _resolve_workspace_root(real)
 
 
+async def _health_check_client(client: BdClientBase) -> bool:
+    """Check if a client is healthy and responsive.
+    
+    Args:
+        client: Client to health check
+        
+    Returns:
+        True if client is healthy, False otherwise
+    """
+    # Only health check daemon clients
+    if not hasattr(client, 'ping'):
+        return True
+    
+    try:
+        await client.ping()
+        return True
+    except Exception:
+        # Any exception means the client is stale/unhealthy
+        return False
+
+
+async def _reconnect_client(canonical: str, max_retries: int = 3) -> BdClientBase:
+    """Attempt to reconnect to daemon with exponential backoff.
+    
+    Args:
+        canonical: Canonical workspace path
+        max_retries: Maximum number of retry attempts (default: 3)
+        
+    Returns:
+        New client instance
+        
+    Raises:
+        BdError: If all reconnection attempts fail
+    """
+    use_daemon = os.environ.get("BEADS_USE_DAEMON", "1") == "1"
+    
+    for attempt in range(max_retries):
+        try:
+            client = create_bd_client(
+                prefer_daemon=use_daemon,
+                working_dir=canonical
+            )
+            
+            # Verify new client works
+            if await _health_check_client(client):
+                _register_client_for_cleanup(client)
+                return client
+                
+        except Exception:
+            if attempt < max_retries - 1:
+                # Exponential backoff: 0.1s, 0.2s, 0.4s
+                backoff = 0.1 * (2 ** attempt)
+                await asyncio.sleep(backoff)
+            continue
+    
+    raise BdError(
+        f"Failed to connect to daemon after {max_retries} attempts. "
+        "The daemon may be stopped or unresponsive."
+    )
+
+
 async def _get_client() -> BdClientBase:
     """Get a BdClient instance for the current workspace.
     
     Uses connection pool to manage per-project daemon sockets.
     Workspace is determined by current_workspace ContextVar or BEADS_WORKING_DIR env.
 
+    Performs health check before returning cached client.
+    On failure, drops from pool and attempts reconnection with exponential backoff.
+    
     Performs version check on first connection to each workspace.
     Uses daemon client if available, falls back to CLI client.
 
@@ -137,7 +201,19 @@ async def _get_client() -> BdClientBase:
     
     # Thread-safe connection pool access
     async with _pool_lock:
-        if canonical not in _connection_pool:
+        if canonical in _connection_pool:
+            # Health check cached client before returning
+            client = _connection_pool[canonical]
+            if not await _health_check_client(client):
+                # Stale connection - remove from pool and reconnect
+                del _connection_pool[canonical]
+                if canonical in _version_checked:
+                    _version_checked.remove(canonical)
+                
+                # Attempt reconnection with backoff
+                client = await _reconnect_client(canonical)
+                _connection_pool[canonical] = client
+        else:
             # Create new client for this workspace
             use_daemon = os.environ.get("BEADS_USE_DAEMON", "1") == "1"
             
@@ -151,8 +227,6 @@ async def _get_client() -> BdClientBase:
             
             # Add to pool
             _connection_pool[canonical] = client
-            
-        client = _connection_pool[canonical]
     
     # Check version once per workspace (only for CLI client)
     if canonical not in _version_checked:
diff --git a/integrations/beads-mcp/tests/test_daemon_health_check.py b/integrations/beads-mcp/tests/test_daemon_health_check.py
new file mode 100644
index 00000000..3bcae46d
--- /dev/null
+++ b/integrations/beads-mcp/tests/test_daemon_health_check.py
@@ -0,0 +1,304 @@
+"""Tests for daemon health check and reconnection logic."""
+
+import asyncio
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from beads_mcp.bd_client import BdError
+from beads_mcp.bd_daemon_client import (
+    BdDaemonClient,
+    DaemonConnectionError,
+    DaemonError,
+    DaemonNotRunningError,
+)
+from beads_mcp.tools import _get_client, _health_check_client, _reconnect_client
+
+
+@pytest.mark.asyncio
+async def test_daemon_client_ping_success():
+    """Test successful ping to daemon."""
+    client = BdDaemonClient(socket_path="/tmp/bd.sock", working_dir="/tmp/test")
+    
+    with patch.object(client, '_send_request', new_callable=AsyncMock) as mock_send:
+        mock_send.return_value = {"message": "pong", "version": "0.9.10"}
+        
+        result = await client.ping()
+        
+        assert result["message"] == "pong"
+        assert result["version"] == "0.9.10"
+        mock_send.assert_called_once_with("ping", {})
+
+
+@pytest.mark.asyncio
+async def test_daemon_client_ping_connection_error():
+    """Test ping when daemon connection fails."""
+    client = BdDaemonClient(socket_path="/tmp/bd.sock", working_dir="/tmp/test")
+    
+    with patch.object(client, '_send_request', new_callable=AsyncMock) as mock_send:
+        mock_send.side_effect = DaemonConnectionError("Connection failed")
+        
+        with pytest.raises(DaemonConnectionError):
+            await client.ping()
+
+
+@pytest.mark.asyncio
+async def test_daemon_client_health_success():
+    """Test successful health check to daemon."""
+    client = BdDaemonClient(socket_path="/tmp/bd.sock", working_dir="/tmp/test")
+    
+    with patch.object(client, '_send_request', new_callable=AsyncMock) as mock_send:
+        mock_send.return_value = {
+            "status": "healthy",
+            "version": "0.9.10",
+            "uptime": 123.45,
+            "cache_size": 5,
+            "db_response_time_ms": 2.5,
+            "active_connections": 3,
+            "memory_bytes": 104857600,
+        }
+        
+        result = await client.health()
+        
+        assert result["status"] == "healthy"
+        assert result["version"] == "0.9.10"
+        assert result["uptime"] == 123.45
+        mock_send.assert_called_once_with("health", {})
+
+
+@pytest.mark.asyncio
+async def test_daemon_client_health_unhealthy():
+    """Test health check when daemon is unhealthy."""
+    client = BdDaemonClient(socket_path="/tmp/bd.sock", working_dir="/tmp/test")
+    
+    with patch.object(client, '_send_request', new_callable=AsyncMock) as mock_send:
+        mock_send.return_value = {
+            "status": "unhealthy",
+            "error": "Database connection failed",
+        }
+        
+        result = await client.health()
+        
+        assert result["status"] == "unhealthy"
+        assert "error" in result
+
+
+@pytest.mark.asyncio
+async def test_health_check_client_daemon_client_healthy():
+    """Test health check for healthy daemon client."""
+    client = BdDaemonClient(socket_path="/tmp/bd.sock", working_dir="/tmp/test")
+    
+    with patch.object(client, 'ping', new_callable=AsyncMock) as mock_ping:
+        mock_ping.return_value = {"message": "pong", "version": "0.9.10"}
+        
+        result = await _health_check_client(client)
+        
+        assert result is True
+        mock_ping.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_health_check_client_daemon_client_unhealthy():
+    """Test health check for unhealthy daemon client."""
+    client = BdDaemonClient(socket_path="/tmp/bd.sock", working_dir="/tmp/test")
+    
+    with patch.object(client, 'ping', new_callable=AsyncMock) as mock_ping:
+        mock_ping.side_effect = DaemonConnectionError("Connection failed")
+        
+        result = await _health_check_client(client)
+        
+        assert result is False
+
+
+@pytest.mark.asyncio
+async def test_health_check_client_cli_client():
+    """Test health check for CLI client (always returns True)."""
+    from beads_mcp.bd_client import BdClient
+    
+    client = BdClient(bd_path="/usr/bin/bd", beads_db="/tmp/test.db")
+    
+    result = await _health_check_client(client)
+    
+    # CLI clients don't have ping, so they're always considered healthy
+    assert result is True
+
+
+@pytest.mark.asyncio
+async def test_reconnect_client_success():
+    """Test successful reconnection after failure."""
+    from beads_mcp.bd_client import create_bd_client
+    
+    with (
+        patch('beads_mcp.tools.create_bd_client') as mock_create,
+        patch('beads_mcp.tools._health_check_client', new_callable=AsyncMock) as mock_health,
+        patch('beads_mcp.tools._register_client_for_cleanup') as mock_register,
+    ):
+        mock_client = MagicMock()
+        mock_create.return_value = mock_client
+        mock_health.return_value = True
+        
+        result = await _reconnect_client("/tmp/test")
+        
+        assert result == mock_client
+        mock_create.assert_called_once_with(prefer_daemon=True, working_dir="/tmp/test")
+        mock_register.assert_called_once_with(mock_client)
+
+
+@pytest.mark.asyncio
+async def test_reconnect_client_retry_with_backoff():
+    """Test reconnection with exponential backoff on failure."""
+    # Need to patch asyncio.sleep in the actual module where it's called
+    import beads_mcp.tools as tools_module
+    
+    with (
+        patch.object(tools_module, 'create_bd_client') as mock_create,
+        patch.object(tools_module, '_health_check_client', new_callable=AsyncMock) as mock_health,
+        patch.object(tools_module, '_register_client_for_cleanup') as mock_register,
+    ):
+        mock_client = MagicMock()
+        
+        # Raise exception first two times, succeed third time
+        mock_create.side_effect = [
+            Exception("Connection failed"),
+            Exception("Connection failed"),
+            mock_client,
+        ]
+        mock_health.return_value = True
+        
+        # Mock asyncio.sleep to track calls
+        sleep_calls = []
+        async def mock_sleep(duration):
+            sleep_calls.append(duration)
+            # Don't actually sleep in tests
+            return
+        
+        with patch.object(asyncio, 'sleep', side_effect=mock_sleep):
+            result = await _reconnect_client("/tmp/test", max_retries=3)
+        
+        assert result == mock_client
+        assert mock_create.call_count == 3
+        assert len(sleep_calls) == 2
+        
+        # Verify exponential backoff: 0.1s, 0.2s
+        assert sleep_calls[0] == 0.1
+        assert sleep_calls[1] == 0.2
+
+
+@pytest.mark.asyncio
+async def test_reconnect_client_max_retries_exceeded():
+    """Test reconnection failure after max retries."""
+    with (
+        patch('beads_mcp.tools.create_bd_client') as mock_create,
+        patch('beads_mcp.tools._health_check_client', new_callable=AsyncMock) as mock_health,
+        patch('asyncio.sleep', new_callable=AsyncMock),
+    ):
+        mock_client = MagicMock()
+        mock_create.return_value = mock_client
+        mock_health.return_value = False  # Always fail health check
+        
+        with pytest.raises(BdError, match="Failed to connect to daemon after 3 attempts"):
+            await _reconnect_client("/tmp/test", max_retries=3)
+        
+        assert mock_create.call_count == 3
+
+
+@pytest.mark.asyncio
+async def test_get_client_uses_cached_healthy_client(monkeypatch):
+    """Test that _get_client returns cached client if healthy."""
+    from beads_mcp import tools
+    
+    # Set up environment
+    monkeypatch.setenv("BEADS_WORKING_DIR", "/tmp/test")
+    
+    mock_client = MagicMock()
+    mock_client._check_version = AsyncMock()
+    
+    with (
+        patch('beads_mcp.tools._canonicalize_path', return_value="/tmp/test"),
+        patch('beads_mcp.tools._health_check_client', new_callable=AsyncMock) as mock_health,
+    ):
+        mock_health.return_value = True
+        
+        # Add mock client to pool and mark as version checked
+        tools._connection_pool["/tmp/test"] = mock_client
+        tools._version_checked.add("/tmp/test")
+        
+        result = await _get_client()
+        
+        assert result == mock_client
+        mock_health.assert_called_once_with(mock_client)
+
+
+@pytest.mark.asyncio
+async def test_get_client_reconnects_on_stale_connection(monkeypatch):
+    """Test that _get_client reconnects when cached client is stale."""
+    from beads_mcp import tools
+    
+    # Set up environment
+    monkeypatch.setenv("BEADS_WORKING_DIR", "/tmp/test")
+    
+    old_client = MagicMock()
+    new_client = MagicMock()
+    new_client._check_version = AsyncMock()
+    
+    with (
+        patch('beads_mcp.tools._canonicalize_path', return_value="/tmp/test"),
+        patch('beads_mcp.tools._health_check_client', new_callable=AsyncMock) as mock_health,
+        patch('beads_mcp.tools._reconnect_client', new_callable=AsyncMock) as mock_reconnect,
+    ):
+        # First health check fails (stale), reconnect returns new client
+        mock_health.return_value = False
+        mock_reconnect.return_value = new_client
+        
+        # Add old client to pool
+        tools._connection_pool["/tmp/test"] = old_client
+        tools._version_checked.add("/tmp/test")
+        
+        result = await _get_client()
+        
+        assert result == new_client
+        assert tools._connection_pool["/tmp/test"] == new_client
+        # Version check is performed after reconnect, so it's back in the set
+        assert "/tmp/test" in tools._version_checked
+        mock_reconnect.assert_called_once_with("/tmp/test")
+
+
+@pytest.mark.asyncio
+async def test_get_client_creates_new_client_if_not_cached(monkeypatch):
+    """Test that _get_client creates new client if not in pool."""
+    from beads_mcp import tools
+    
+    # Clear pool
+    tools._connection_pool.clear()
+    tools._version_checked.clear()
+    
+    # Set up environment
+    monkeypatch.setenv("BEADS_WORKING_DIR", "/tmp/test")
+    
+    mock_client = MagicMock()
+    mock_client._check_version = AsyncMock()
+    
+    with (
+        patch('beads_mcp.tools._canonicalize_path', return_value="/tmp/test"),
+        patch('beads_mcp.tools.create_bd_client', return_value=mock_client) as mock_create,
+        patch('beads_mcp.tools._register_client_for_cleanup') as mock_register,
+    ):
+        result = await _get_client()
+        
+        assert result == mock_client
+        assert tools._connection_pool["/tmp/test"] == mock_client
+        mock_create.assert_called_once_with(prefer_daemon=True, working_dir="/tmp/test")
+        mock_register.assert_called_once_with(mock_client)
+
+
+@pytest.mark.asyncio
+async def test_get_client_no_workspace_error():
+    """Test that _get_client raises error if no workspace is set."""
+    from beads_mcp import tools
+    
+    # Clear context
+    tools.current_workspace.set(None)
+    
+    with patch.dict('os.environ', {}, clear=True):
+        with pytest.raises(BdError, match="No workspace set"):
+            await _get_client()