package main import ( "fmt" "os" "os/exec" "path/filepath" "strconv" "strings" "time" "github.com/steveyegge/beads/internal/config" "github.com/steveyegge/beads/internal/debug" "github.com/steveyegge/beads/internal/lockfile" "github.com/steveyegge/beads/internal/rpc" "github.com/steveyegge/beads/internal/ui" ) // daemonShutdownTimeout is how long to wait for graceful shutdown before force killing. // 1 second is sufficient - if daemon hasn't stopped by then, it's likely hung. const daemonShutdownTimeout = 1 * time.Second // daemonShutdownPollInterval is how often to check if daemon has stopped. const daemonShutdownPollInterval = 100 * time.Millisecond // daemonShutdownAttempts is the number of poll attempts before force kill. const daemonShutdownAttempts = int(daemonShutdownTimeout / daemonShutdownPollInterval) // Daemon start failure tracking for exponential backoff var ( lastDaemonStartAttempt time.Time daemonStartFailures int ) var ( executableFn = os.Executable execCommandFn = exec.Command openFileFn = os.OpenFile findProcessFn = os.FindProcess removeFileFn = os.Remove configureDaemonProcessFn = configureDaemonProcess waitForSocketReadinessFn = waitForSocketReadiness startDaemonProcessFn = startDaemonProcess isDaemonRunningFn = isDaemonRunning sendStopSignalFn = sendStopSignal ) // shouldAutoStartDaemon checks if daemon auto-start is enabled func shouldAutoStartDaemon() bool { // Check BEADS_NO_DAEMON first (escape hatch for single-user workflows) noDaemon := strings.ToLower(strings.TrimSpace(os.Getenv("BEADS_NO_DAEMON"))) if noDaemon == "1" || noDaemon == "true" || noDaemon == "yes" || noDaemon == "on" { return false // Explicit opt-out } // Check if we're in a git worktree without sync-branch configured. // In this case, daemon is unsafe because all worktrees share the same // .beads directory and the daemon would commit to the wrong branch. // When sync-branch is configured, daemon is safe because commits go // to a dedicated branch via an internal worktree. if shouldDisableDaemonForWorktree() { return false } // Use viper to read from config file or BEADS_AUTO_START_DAEMON env var // Viper handles BEADS_AUTO_START_DAEMON automatically via BindEnv return config.GetBool("auto-start-daemon") // Defaults to true } // restartDaemonForVersionMismatch stops the old daemon and starts a new one // Returns true if restart was successful func restartDaemonForVersionMismatch() bool { pidFile, err := getPIDFilePath() if err != nil { debug.Logf("failed to get PID file path: %v", err) return false } socketPath := getSocketPath() // Check if daemon is running and stop it forcedKill := false if isRunning, pid := isDaemonRunningFn(pidFile); isRunning { debug.Logf("stopping old daemon (PID %d)", pid) process, err := findProcessFn(pid) if err != nil { debug.Logf("failed to find process: %v", err) return false } // Send stop signal if err := sendStopSignalFn(process); err != nil { debug.Logf("failed to signal daemon: %v", err) return false } // Wait for daemon to stop, then force kill for i := 0; i < daemonShutdownAttempts; i++ { time.Sleep(daemonShutdownPollInterval) if isRunning, _ := isDaemonRunningFn(pidFile); !isRunning { debug.Logf("old daemon stopped successfully") break } } // Force kill if still running if isRunning, _ := isDaemonRunningFn(pidFile); isRunning { debug.Logf("force killing old daemon") _ = process.Kill() forcedKill = true } } // Clean up stale socket and PID file after force kill or if not running if forcedKill || !isDaemonRunningQuiet(pidFile) { _ = removeFileFn(socketPath) _ = removeFileFn(pidFile) } // Start new daemon with current binary version exe, err := executableFn() if err != nil { debug.Logf("failed to get executable path: %v", err) return false } args := []string{"daemon", "start"} cmd := execCommandFn(exe, args...) cmd.Env = append(os.Environ(), "BD_DAEMON_FOREGROUND=1") // Set working directory to database directory so daemon finds correct DB if dbPath != "" { cmd.Dir = filepath.Dir(dbPath) } configureDaemonProcessFn(cmd) devNull, err := openFileFn(os.DevNull, os.O_RDWR, 0) if err == nil { cmd.Stdin = devNull cmd.Stdout = devNull cmd.Stderr = devNull defer func() { _ = devNull.Close() }() } if err := cmd.Start(); err != nil { debug.Logf("failed to start new daemon: %v", err) return false } // Reap the process to avoid zombies go func() { _ = cmd.Wait() }() // Wait for daemon to be ready using shared helper if waitForSocketReadinessFn(socketPath, 5*time.Second) { debug.Logf("new daemon started successfully") return true } debug.Logf("new daemon failed to become ready") fmt.Fprintf(os.Stderr, "%s Daemon restart timed out (>5s). Running in direct mode.\n", ui.RenderWarn("Warning:")) fmt.Fprintf(os.Stderr, " %s Run 'bd doctor' to diagnose daemon issues\n", ui.RenderMuted("Hint:")) return false } // isDaemonRunningQuiet checks if daemon is running without output func isDaemonRunningQuiet(pidFile string) bool { isRunning, _ := isDaemonRunningFn(pidFile) return isRunning } // tryAutoStartDaemon attempts to start the daemon in the background // Returns true if daemon was started successfully and socket is ready func tryAutoStartDaemon(socketPath string) bool { if !canRetryDaemonStart() { debugLog("skipping auto-start due to recent failures") return false } if isDaemonHealthy(socketPath) { debugLog("daemon already running and healthy") return true } lockPath := socketPath + ".startlock" if !acquireStartLock(lockPath, socketPath) { return false } defer func() { if err := os.Remove(lockPath); err != nil && !os.IsNotExist(err) { debugLog("failed to remove lock file: %v", err) } }() if handleExistingSocket(socketPath) { return true } socketPath = determineSocketPath(socketPath) return startDaemonProcessFn(socketPath) } func debugLog(msg string, args ...interface{}) { debug.Logf(msg, args...) } func isDaemonHealthy(socketPath string) bool { client, err := rpc.TryConnect(socketPath) if err == nil && client != nil { _ = client.Close() return true } return false } func acquireStartLock(lockPath, socketPath string) bool { if err := ensureLockDirectory(lockPath); err != nil { debugLog("failed to ensure lock directory: %v", err) return false } // nolint:gosec // G304: lockPath is derived from secure beads directory lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0600) if err != nil { // Lock file exists - check if daemon is actually starting lockPID, pidErr := readPIDFromFile(lockPath) if pidErr != nil || !isPIDAlive(lockPID) { // Stale lock from crashed process - clean up immediately (avoids 5s wait) debugLog("startlock is stale (PID %d dead or unreadable), cleaning up", lockPID) _ = os.Remove(lockPath) // Retry lock acquisition after cleanup return acquireStartLock(lockPath, socketPath) } // PID is alive - but is daemon actually running/starting? // Use flock-based check as authoritative source (immune to PID reuse) beadsDir := filepath.Dir(dbPath) if running, _ := lockfile.TryDaemonLock(beadsDir); !running { // Daemon lock not held - the start attempt failed or process was reused debugLog("startlock PID %d alive but daemon lock not held, cleaning up", lockPID) _ = os.Remove(lockPath) return acquireStartLock(lockPath, socketPath) } // Daemon lock is held - daemon is legitimately starting, wait for socket debugLog("another process (PID %d) is starting daemon, waiting for readiness", lockPID) if waitForSocketReadiness(socketPath, 5*time.Second) { return true } return handleStaleLock(lockPath, socketPath) } _, _ = fmt.Fprintf(lockFile, "%d\n", os.Getpid()) _ = lockFile.Close() // Best-effort close during startup return true } func handleStaleLock(lockPath, socketPath string) bool { lockPID, err := readPIDFromFile(lockPath) // Check if PID is dead if err != nil || !isPIDAlive(lockPID) { debugLog("lock is stale (PID %d dead or unreadable), removing and retrying", lockPID) _ = os.Remove(lockPath) return tryAutoStartDaemon(socketPath) } // PID is alive - but check daemon lock as authoritative source (immune to PID reuse) beadsDir := filepath.Dir(dbPath) if running, _ := lockfile.TryDaemonLock(beadsDir); !running { debugLog("lock PID %d alive but daemon lock not held, removing and retrying", lockPID) _ = os.Remove(lockPath) return tryAutoStartDaemon(socketPath) } // Daemon lock is held - daemon is genuinely running but socket isn't ready // This shouldn't happen normally, but don't clean up a legitimate lock return false } func handleExistingSocket(socketPath string) bool { if _, err := os.Stat(socketPath); err != nil { return false } if canDialSocket(socketPath, 200*time.Millisecond) { debugLog("daemon started by another process") return true } // Use flock-based check as authoritative source (immune to PID reuse) // If daemon lock is not held, daemon is definitely dead regardless of PID file beadsDir := filepath.Dir(dbPath) if running, pid := lockfile.TryDaemonLock(beadsDir); running { debugLog("daemon lock held (PID %d), waiting for socket", pid) return waitForSocketReadiness(socketPath, 5*time.Second) } // Lock not held - daemon is dead, clean up stale artifacts debugLog("socket is stale (daemon lock not held), cleaning up") _ = os.Remove(socketPath) // Best-effort cleanup, file may not exist pidFile := getPIDFileForSocket(socketPath) if pidFile != "" { _ = os.Remove(pidFile) // Best-effort cleanup, file may not exist } // Also clean up daemon.lock file (contains stale metadata) lockFile := filepath.Join(beadsDir, "daemon.lock") _ = os.Remove(lockFile) // Best-effort cleanup return false } func determineSocketPath(socketPath string) string { return socketPath } // ensureLockDirectory ensures the parent directory exists for the lock file. // Needed when ShortSocketPath routes sockets into /tmp/beads-*/bd.sock. func ensureLockDirectory(lockPath string) error { dir := filepath.Dir(lockPath) if dir == "" { return fmt.Errorf("lock directory missing for %s", lockPath) } if _, err := os.Stat(dir); err == nil { return nil } else if !os.IsNotExist(err) { return err } return os.MkdirAll(dir, 0o700) } func startDaemonProcess(socketPath string) bool { // Early check: daemon requires a git repository (unless --local mode) // Skip attempting to start and avoid the 5-second wait if not in git repo if !isGitRepo() { debugLog("not in a git repository, skipping daemon start") fmt.Fprintf(os.Stderr, "%s No git repository initialized - running without background sync\n", ui.RenderMuted("Note:")) return false } binPath, err := executableFn() if err != nil { binPath = os.Args[0] } args := []string{"daemon", "start"} cmd := execCommandFn(binPath, args...) setupDaemonIO(cmd) if dbPath != "" { cmd.Dir = filepath.Dir(dbPath) } configureDaemonProcessFn(cmd) if err := cmd.Start(); err != nil { recordDaemonStartFailure() debugLog("failed to start daemon: %v", err) return false } go func() { _ = cmd.Wait() }() if waitForSocketReadinessFn(socketPath, 5*time.Second) { recordDaemonStartSuccess() return true } recordDaemonStartFailure() debugLog("daemon socket not ready after 5 seconds") // Check for daemon-error file which contains the actual failure reason beadsDir := filepath.Dir(dbPath) errFile := filepath.Join(beadsDir, "daemon-error") if errContent, err := os.ReadFile(errFile); err == nil && len(errContent) > 0 { // Show the actual error from the daemon fmt.Fprintf(os.Stderr, "%s Daemon failed to start:\n", ui.RenderWarn("Warning:")) fmt.Fprintf(os.Stderr, "%s\n", string(errContent)) return false } // Emit visible warning so user understands why command was slow fmt.Fprintf(os.Stderr, "%s Daemon took too long to start (>5s). Running in direct mode.\n", ui.RenderWarn("Warning:")) fmt.Fprintf(os.Stderr, " %s Run 'bd doctor' to diagnose daemon issues\n", ui.RenderMuted("Hint:")) return false } func setupDaemonIO(cmd *exec.Cmd) { devNull, err := openFileFn(os.DevNull, os.O_RDWR, 0) if err == nil { cmd.Stdout = devNull cmd.Stderr = devNull cmd.Stdin = devNull go func() { time.Sleep(1 * time.Second) _ = devNull.Close() }() } } // getPIDFileForSocket returns the PID file path. // Note: socketPath parameter is unused - PID file is always in .beads directory // (not socket directory, which may be in /tmp for short paths). func getPIDFileForSocket(_ string) string { dir := filepath.Dir(dbPath) return filepath.Join(dir, "daemon.pid") } // readPIDFromFile reads a PID from a file func readPIDFromFile(path string) (int, error) { // nolint:gosec // G304: path is derived from secure beads directory data, err := os.ReadFile(path) if err != nil { return 0, err } pid, err := strconv.Atoi(strings.TrimSpace(string(data))) if err != nil { return 0, err } return pid, nil } // isPIDAlive checks if a process with the given PID is running func isPIDAlive(pid int) bool { if pid <= 0 { return false } return isProcessRunning(pid) } // canDialSocket attempts a quick dial to the socket with a timeout func canDialSocket(socketPath string, timeout time.Duration) bool { client, err := rpc.TryConnectWithTimeout(socketPath, timeout) if err != nil || client == nil { return false } _ = client.Close() // Best-effort close after health check return true } // waitForSocketReadiness waits for daemon socket to be ready by testing actual connections // //nolint:unparam // timeout is configurable even though current callers use 5s func waitForSocketReadiness(socketPath string, timeout time.Duration) bool { deadline := time.Now().Add(timeout) for time.Now().Before(deadline) { if canDialSocket(socketPath, 200*time.Millisecond) { return true } time.Sleep(100 * time.Millisecond) } return false } func canRetryDaemonStart() bool { if daemonStartFailures == 0 { return true } // Exponential backoff: 5s, 10s, 20s, 40s, 80s, 120s (capped at 120s) backoff := time.Duration(5*(1< 120*time.Second { backoff = 120 * time.Second } return time.Since(lastDaemonStartAttempt) > backoff } func recordDaemonStartSuccess() { daemonStartFailures = 0 } func recordDaemonStartFailure() { lastDaemonStartAttempt = time.Now() daemonStartFailures++ // No cap needed - backoff is capped at 120s in canRetryDaemonStart } // getSocketPath returns the daemon socket path based on the database location. // If BD_SOCKET env var is set, uses that value instead (enables test isolation). // On Unix systems, uses rpc.ShortSocketPath to avoid exceeding socket path limits // (macOS: 104 chars) by relocating long paths to /tmp/beads-{hash}/ (GH#1001). func getSocketPath() string { // Check environment variable first (enables test isolation) if socketPath := os.Getenv("BD_SOCKET"); socketPath != "" { return socketPath } // Get workspace path (parent of .beads directory) beadsDir := filepath.Dir(dbPath) workspacePath := filepath.Dir(beadsDir) return rpc.ShortSocketPath(workspacePath) } // emitVerboseWarning prints a one-line warning when falling back to direct mode func emitVerboseWarning() { switch daemonStatus.FallbackReason { case FallbackConnectFailed: fmt.Fprintf(os.Stderr, "Warning: Daemon unreachable at %s. Running in direct mode. Hint: bd daemon status\n", daemonStatus.SocketPath) case FallbackHealthFailed: fmt.Fprintf(os.Stderr, "Warning: Daemon unhealthy. Falling back to direct mode. Hint: bd daemon status --all\n") case FallbackAutoStartDisabled: fmt.Fprintf(os.Stderr, "Warning: Auto-start disabled (BEADS_AUTO_START_DAEMON=false). Running in direct mode. Hint: bd daemon\n") case FallbackAutoStartFailed: fmt.Fprintf(os.Stderr, "Warning: Failed to auto-start daemon. Running in direct mode. Hint: bd daemon status\n") case FallbackDaemonUnsupported: fmt.Fprintf(os.Stderr, "Warning: Daemon does not support this command yet. Running in direct mode. Hint: update daemon or use local mode.\n") case FallbackWorktreeSafety: // Don't warn - this is expected behavior. User can configure sync-branch to enable daemon. return case FallbackFlagNoDaemon: // Don't warn when user explicitly requested --no-daemon return } } func getDebounceDuration() time.Duration { duration := config.GetDuration("flush-debounce") if duration == 0 { // If parsing failed, use default return 5 * time.Second } return duration }