Files
beads/cmd/bd/daemon_autostart.go
aleiby 028921b04a fix(daemon): prevent stack overflow on empty database path (#1288) (#1313)
When metadata.json has an empty database field and no beads.db file exists,
filepath.Dir("") returns "." which causes lock file operations to use the
current directory instead of the beads directory. This could lead to stack
overflow or unexpected behavior.

Add early validation in tryAutoStartDaemon to check if dbPath is empty and
return false gracefully, consistent with other early-return conditions.
2026-01-25 17:59:57 -08:00

666 lines
21 KiB
Go

package main
import (
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"time"
"github.com/steveyegge/beads/internal/beads"
"github.com/steveyegge/beads/internal/config"
"github.com/steveyegge/beads/internal/configfile"
"github.com/steveyegge/beads/internal/debug"
"github.com/steveyegge/beads/internal/lockfile"
"github.com/steveyegge/beads/internal/rpc"
"github.com/steveyegge/beads/internal/ui"
)
// daemonShutdownTimeout is how long to wait for graceful shutdown before force killing.
// 1 second is sufficient - if daemon hasn't stopped by then, it's likely hung.
const daemonShutdownTimeout = 1 * time.Second
// daemonShutdownPollInterval is how often to check if daemon has stopped.
const daemonShutdownPollInterval = 100 * time.Millisecond
// daemonShutdownAttempts is the number of poll attempts before force kill.
const daemonShutdownAttempts = int(daemonShutdownTimeout / daemonShutdownPollInterval)
// Daemon start failure tracking for exponential backoff
var (
lastDaemonStartAttempt time.Time
daemonStartFailures int
)
var (
executableFn = os.Executable
execCommandFn = exec.Command
openFileFn = os.OpenFile
findProcessFn = os.FindProcess
removeFileFn = os.Remove
configureDaemonProcessFn = configureDaemonProcess
waitForSocketReadinessFn = waitForSocketReadiness
startDaemonProcessFn = startDaemonProcess
isDaemonRunningFn = isDaemonRunning
sendStopSignalFn = sendStopSignal
)
// isDoltBackend returns true if the current workspace uses any Dolt backend mode.
// The daemon is only needed for SQLite - Dolt has its own sync mechanism.
func isDoltBackend() bool {
beadsDir := ""
if dbPath != "" {
beadsDir = filepath.Dir(dbPath)
} else if found := beads.FindDatabasePath(); found != "" {
beadsDir = filepath.Dir(found)
} else {
beadsDir = beads.FindBeadsDir()
}
if beadsDir == "" {
return false
}
cfg, err := configfile.Load(beadsDir)
if err != nil || cfg == nil {
return false
}
return cfg.GetBackend() == configfile.BackendDolt
}
// singleProcessOnlyBackend returns true if the current workspace backend is configured
// as single-process-only (currently Dolt embedded).
//
// Best-effort: if we can't determine the backend, we return false and defer to other logic.
func singleProcessOnlyBackend() bool {
// Prefer dbPath if set; it points to either .beads/<db>.db (sqlite) or .beads/dolt (dolt dir).
beadsDir := ""
if dbPath != "" {
beadsDir = filepath.Dir(dbPath)
} else if found := beads.FindDatabasePath(); found != "" {
beadsDir = filepath.Dir(found)
} else {
beadsDir = beads.FindBeadsDir()
}
if beadsDir == "" {
return false
}
cfg, err := configfile.Load(beadsDir)
if err != nil || cfg == nil {
return false
}
// Use GetCapabilities() to properly handle Dolt server mode
return cfg.GetCapabilities().SingleProcessOnly
}
// shouldAutoStartDaemon checks if daemon auto-start is enabled
func shouldAutoStartDaemon() bool {
// Dolt backend doesn't need daemon - it has its own sync via dolt sql-server.
// This applies to both embedded and server modes.
if isDoltBackend() {
return false
}
// For other backends, check SingleProcessOnly capability
if singleProcessOnlyBackend() {
return false
}
// Check BEADS_NO_DAEMON first (escape hatch for single-user workflows)
noDaemon := strings.ToLower(strings.TrimSpace(os.Getenv("BEADS_NO_DAEMON")))
if noDaemon == "1" || noDaemon == "true" || noDaemon == "yes" || noDaemon == "on" {
return false // Explicit opt-out
}
// Check if we're in a git worktree without sync-branch configured.
// In this case, daemon is unsafe because all worktrees share the same
// .beads directory and the daemon would commit to the wrong branch.
// When sync-branch is configured, daemon is safe because commits go
// to a dedicated branch via an internal worktree.
if shouldDisableDaemonForWorktree() {
return false
}
// Use viper to read from config file or BEADS_AUTO_START_DAEMON env var
// Viper handles BEADS_AUTO_START_DAEMON automatically via BindEnv
return config.GetBool("auto-start-daemon") // Defaults to true
}
// restartDaemonForVersionMismatch stops the old daemon and starts a new one
// Returns true if restart was successful
func restartDaemonForVersionMismatch() bool {
// Dolt backend doesn't need daemon - it has its own sync mechanism.
if isDoltBackend() {
debugLog("dolt backend: skipping daemon restart for version mismatch")
return false
}
// For other backends, check SingleProcessOnly capability
if singleProcessOnlyBackend() {
debugLog("single-process backend: skipping daemon restart for version mismatch")
return false
}
pidFile, err := getPIDFilePath()
if err != nil {
debug.Logf("failed to get PID file path: %v", err)
return false
}
socketPath := getSocketPath()
// Check if daemon is running and stop it
forcedKill := false
if isRunning, pid := isDaemonRunningFn(pidFile); isRunning {
debug.Logf("stopping old daemon (PID %d)", pid)
process, err := findProcessFn(pid)
if err != nil {
debug.Logf("failed to find process: %v", err)
return false
}
// Send stop signal
if err := sendStopSignalFn(process); err != nil {
debug.Logf("failed to signal daemon: %v", err)
return false
}
// Wait for daemon to stop, then force kill
for i := 0; i < daemonShutdownAttempts; i++ {
time.Sleep(daemonShutdownPollInterval)
if isRunning, _ := isDaemonRunningFn(pidFile); !isRunning {
debug.Logf("old daemon stopped successfully")
break
}
}
// Force kill if still running
if isRunning, _ := isDaemonRunningFn(pidFile); isRunning {
debug.Logf("force killing old daemon")
_ = process.Kill()
forcedKill = true
}
}
// Clean up stale socket and PID file after force kill or if not running
if forcedKill || !isDaemonRunningQuiet(pidFile) {
_ = removeFileFn(socketPath)
_ = removeFileFn(pidFile)
}
// Start new daemon with current binary version
exe, err := executableFn()
if err != nil {
debug.Logf("failed to get executable path: %v", err)
return false
}
args := []string{"daemon", "start"}
cmd := execCommandFn(exe, args...)
cmd.Env = append(os.Environ(), "BD_DAEMON_FOREGROUND=1")
// Set working directory to database directory so daemon finds correct DB
if dbPath != "" {
cmd.Dir = filepath.Dir(dbPath)
}
configureDaemonProcessFn(cmd)
devNull, err := openFileFn(os.DevNull, os.O_RDWR, 0)
if err == nil {
cmd.Stdin = devNull
cmd.Stdout = devNull
cmd.Stderr = devNull
defer func() { _ = devNull.Close() }()
}
if err := cmd.Start(); err != nil {
debug.Logf("failed to start new daemon: %v", err)
return false
}
// Reap the process to avoid zombies
go func() { _ = cmd.Wait() }()
// Wait for daemon to be ready using shared helper
if waitForSocketReadinessFn(socketPath, 5*time.Second) {
debug.Logf("new daemon started successfully")
return true
}
debug.Logf("new daemon failed to become ready")
fmt.Fprintf(os.Stderr, "%s Daemon restart timed out (>5s). Running in direct mode.\n", ui.RenderWarn("Warning:"))
fmt.Fprintf(os.Stderr, " %s Run 'bd doctor' to diagnose daemon issues\n", ui.RenderMuted("Hint:"))
return false
}
// isDaemonRunningQuiet checks if daemon is running without output
func isDaemonRunningQuiet(pidFile string) bool {
isRunning, _ := isDaemonRunningFn(pidFile)
return isRunning
}
// tryAutoStartDaemon attempts to start the daemon in the background
// Returns true if daemon was started successfully and socket is ready
func tryAutoStartDaemon(socketPath string) bool {
// Dolt backend doesn't need daemon - it has its own sync mechanism.
if isDoltBackend() {
return false
}
// For other backends, check SingleProcessOnly capability
if singleProcessOnlyBackend() {
return false
}
// Empty dbPath causes filepath.Dir("") to return "." which breaks lock
// file operations. This can happen when metadata.json has an empty database
// field and no beads.db file exists. Skip daemon start gracefully.
if dbPath == "" {
debugLog("skipping auto-start: no database path configured")
return false
}
if !canRetryDaemonStart() {
debugLog("skipping auto-start due to recent failures")
return false
}
if isDaemonHealthy(socketPath) {
debugLog("daemon already running and healthy")
return true
}
lockPath := socketPath + ".startlock"
if !acquireStartLock(lockPath, socketPath) {
return false
}
defer func() {
if err := os.Remove(lockPath); err != nil && !os.IsNotExist(err) {
debugLog("failed to remove lock file: %v", err)
}
}()
if handleExistingSocket(socketPath) {
return true
}
socketPath = determineSocketPath(socketPath)
return startDaemonProcessFn(socketPath)
}
func debugLog(msg string, args ...interface{}) {
debug.Logf(msg, args...)
}
func isDaemonHealthy(socketPath string) bool {
client, err := rpc.TryConnect(socketPath)
if err == nil && client != nil {
_ = client.Close()
return true
}
return false
}
func acquireStartLock(lockPath, socketPath string) bool {
if err := ensureLockDirectory(lockPath); err != nil {
debugLog("failed to ensure lock directory: %v", err)
return false
}
// Bounded retry loop to prevent infinite recursion when lock cleanup fails
const maxRetries = 3
for attempt := 0; attempt < maxRetries; attempt++ {
// nolint:gosec // G304: lockPath is derived from secure beads directory
lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0600)
if err == nil {
// Successfully acquired lock
_, _ = fmt.Fprintf(lockFile, "%d\n", os.Getpid())
_ = lockFile.Close() // Best-effort close during startup
return true
}
// Lock file exists - check if daemon is actually starting
lockPID, pidErr := readPIDFromFile(lockPath)
if pidErr != nil || !isPIDAlive(lockPID) {
// Stale lock from crashed process - clean up immediately (avoids 5s wait)
debugLog("startlock is stale (PID %d dead or unreadable), cleaning up", lockPID)
if rmErr := removeFileFn(lockPath); rmErr != nil {
debugLog("failed to remove stale lock file: %v", rmErr)
return false // Can't acquire lock if we can't clean up
}
// Continue to next iteration to retry lock acquisition
continue
}
// PID is alive - but is daemon actually running/starting?
// Use flock-based check as authoritative source (immune to PID reuse)
beadsDir := filepath.Dir(dbPath)
if running, _ := lockfile.TryDaemonLock(beadsDir); !running {
// Daemon lock not held - the start attempt failed or process was reused
debugLog("startlock PID %d alive but daemon lock not held, cleaning up", lockPID)
if rmErr := removeFileFn(lockPath); rmErr != nil {
debugLog("failed to remove orphaned lock file: %v", rmErr)
return false // Can't acquire lock if we can't clean up
}
// Continue to next iteration to retry lock acquisition
continue
}
// Daemon lock is held - daemon is legitimately starting, wait for socket
debugLog("another process (PID %d) is starting daemon, waiting for readiness", lockPID)
if waitForSocketReadiness(socketPath, 5*time.Second) {
return true
}
return handleStaleLock(lockPath)
}
debugLog("failed to acquire start lock after %d attempts", maxRetries)
return false
}
func handleStaleLock(lockPath string) bool {
lockPID, err := readPIDFromFile(lockPath)
// Check if PID is dead
if err != nil || !isPIDAlive(lockPID) {
debugLog("lock is stale (PID %d dead or unreadable), removing", lockPID)
if rmErr := removeFileFn(lockPath); rmErr != nil {
debugLog("failed to remove stale lock in handleStaleLock: %v", rmErr)
}
// Return false to let caller retry. DO NOT call tryAutoStartDaemon here
// to avoid infinite recursion: acquireStartLock -> handleStaleLock ->
// tryAutoStartDaemon -> acquireStartLock -> ...
return false
}
// PID is alive - but check daemon lock as authoritative source (immune to PID reuse)
beadsDir := filepath.Dir(dbPath)
if running, _ := lockfile.TryDaemonLock(beadsDir); !running {
debugLog("lock PID %d alive but daemon lock not held, removing", lockPID)
if rmErr := removeFileFn(lockPath); rmErr != nil {
debugLog("failed to remove orphaned lock in handleStaleLock: %v", rmErr)
}
// Return false to let caller retry. DO NOT call tryAutoStartDaemon here
// to avoid infinite recursion.
return false
}
// Daemon lock is held - daemon is genuinely running but socket isn't ready
// This shouldn't happen normally, but don't clean up a legitimate lock
return false
}
func handleExistingSocket(socketPath string) bool {
if _, err := os.Stat(socketPath); err != nil {
return false
}
if canDialSocket(socketPath, 200*time.Millisecond) {
debugLog("daemon started by another process")
return true
}
// Use flock-based check as authoritative source (immune to PID reuse)
// If daemon lock is not held, daemon is definitely dead regardless of PID file
beadsDir := filepath.Dir(dbPath)
if running, pid := lockfile.TryDaemonLock(beadsDir); running {
debugLog("daemon lock held (PID %d), waiting for socket", pid)
return waitForSocketReadiness(socketPath, 5*time.Second)
}
// Lock not held - daemon is dead, clean up stale artifacts
debugLog("socket is stale (daemon lock not held), cleaning up")
_ = os.Remove(socketPath) // Best-effort cleanup, file may not exist
pidFile := getPIDFileForSocket(socketPath)
if pidFile != "" {
_ = os.Remove(pidFile) // Best-effort cleanup, file may not exist
}
// Also clean up daemon.lock file (contains stale metadata)
lockFile := filepath.Join(beadsDir, "daemon.lock")
_ = os.Remove(lockFile) // Best-effort cleanup
return false
}
func determineSocketPath(socketPath string) string {
return socketPath
}
// ensureLockDirectory ensures the parent directory exists for the lock file.
// Needed when ShortSocketPath routes sockets into /tmp/beads-*/bd.sock.
func ensureLockDirectory(lockPath string) error {
dir := filepath.Dir(lockPath)
if dir == "" {
return fmt.Errorf("lock directory missing for %s", lockPath)
}
if _, err := os.Stat(dir); err == nil {
return nil
} else if !os.IsNotExist(err) {
return err
}
return os.MkdirAll(dir, 0o700)
}
func startDaemonProcess(socketPath string) bool {
// Dolt backend doesn't need daemon - it has its own sync mechanism.
if isDoltBackend() {
debugLog("dolt backend: skipping daemon start")
return false
}
// For other backends, check SingleProcessOnly capability
if singleProcessOnlyBackend() {
debugLog("single-process backend: skipping daemon start")
return false
}
// Early check: daemon requires a git repository (unless --local mode)
// Skip attempting to start and avoid the 5-second wait if not in git repo
if !isGitRepo() {
debugLog("not in a git repository, skipping daemon start")
if !quietFlag {
fmt.Fprintf(os.Stderr, "%s No git repository initialized - running without background sync\n", ui.RenderMuted("Note:"))
}
return false
}
binPath, err := executableFn()
if err != nil {
binPath = os.Args[0]
}
// Keep sqlite auto-start behavior unchanged: start the daemon via the public
// `bd daemon start` entrypoint (it will daemonize itself as needed).
args := []string{"daemon", "start"}
cmd := execCommandFn(binPath, args...)
// Mark this as a daemon-foreground child so we don't track/kill based on the
// short-lived launcher process PID (see computeDaemonParentPID()).
// Also force the daemon to bind the same socket we're probing for readiness,
// avoiding any mismatch between workspace-derived paths.
cmd.Env = append(os.Environ(),
"BD_DAEMON_FOREGROUND=1",
"BD_SOCKET="+socketPath,
)
setupDaemonIO(cmd)
if dbPath != "" {
cmd.Dir = filepath.Dir(dbPath)
}
configureDaemonProcessFn(cmd)
if err := cmd.Start(); err != nil {
recordDaemonStartFailure()
debugLog("failed to start daemon: %v", err)
return false
}
go func() { _ = cmd.Wait() }()
if waitForSocketReadinessFn(socketPath, 5*time.Second) {
recordDaemonStartSuccess()
return true
}
recordDaemonStartFailure()
debugLog("daemon socket not ready after 5 seconds")
// Check for daemon-error file which contains the actual failure reason
beadsDir := filepath.Dir(dbPath)
errFile := filepath.Join(beadsDir, "daemon-error")
if errContent, err := os.ReadFile(errFile); err == nil && len(errContent) > 0 {
// Show the actual error from the daemon
fmt.Fprintf(os.Stderr, "%s Daemon failed to start:\n", ui.RenderWarn("Warning:"))
fmt.Fprintf(os.Stderr, "%s\n", string(errContent))
return false
}
// Emit visible warning so user understands why command was slow
fmt.Fprintf(os.Stderr, "%s Daemon took too long to start (>5s). Running in direct mode.\n", ui.RenderWarn("Warning:"))
fmt.Fprintf(os.Stderr, " %s Run 'bd doctor' to diagnose daemon issues\n", ui.RenderMuted("Hint:"))
return false
}
func setupDaemonIO(cmd *exec.Cmd) {
devNull, err := openFileFn(os.DevNull, os.O_RDWR, 0)
if err == nil {
cmd.Stdout = devNull
cmd.Stderr = devNull
cmd.Stdin = devNull
go func() {
time.Sleep(1 * time.Second)
_ = devNull.Close()
}()
}
}
// getPIDFileForSocket returns the PID file path.
// Note: socketPath parameter is unused - PID file is always in .beads directory
// (not socket directory, which may be in /tmp for short paths).
func getPIDFileForSocket(_ string) string {
dir := filepath.Dir(dbPath)
return filepath.Join(dir, "daemon.pid")
}
// readPIDFromFile reads a PID from a file
func readPIDFromFile(path string) (int, error) {
// nolint:gosec // G304: path is derived from secure beads directory
data, err := os.ReadFile(path)
if err != nil {
return 0, err
}
pid, err := strconv.Atoi(strings.TrimSpace(string(data)))
if err != nil {
return 0, err
}
return pid, nil
}
// isPIDAlive checks if a process with the given PID is running
func isPIDAlive(pid int) bool {
if pid <= 0 {
return false
}
return isProcessRunning(pid)
}
// canDialSocket attempts a quick dial to the socket with a timeout
func canDialSocket(socketPath string, timeout time.Duration) bool {
client, err := rpc.TryConnectWithTimeout(socketPath, timeout)
if err != nil || client == nil {
return false
}
_ = client.Close() // Best-effort close after health check
return true
}
// waitForSocketReadiness waits for daemon socket to be ready by testing actual connections
//
//nolint:unparam // timeout is configurable even though current callers use 5s
func waitForSocketReadiness(socketPath string, timeout time.Duration) bool {
deadline := time.Now().Add(timeout)
for time.Now().Before(deadline) {
if canDialSocket(socketPath, 200*time.Millisecond) {
return true
}
time.Sleep(100 * time.Millisecond)
}
return false
}
func canRetryDaemonStart() bool {
if daemonStartFailures == 0 {
return true
}
// Exponential backoff: 5s, 10s, 20s, 40s, 80s, 120s (capped at 120s)
backoff := time.Duration(5*(1<<uint(daemonStartFailures-1))) * time.Second
if backoff > 120*time.Second {
backoff = 120 * time.Second
}
return time.Since(lastDaemonStartAttempt) > backoff
}
func recordDaemonStartSuccess() {
daemonStartFailures = 0
}
func recordDaemonStartFailure() {
lastDaemonStartAttempt = time.Now()
daemonStartFailures++
// No cap needed - backoff is capped at 120s in canRetryDaemonStart
}
// getSocketPath returns the daemon socket path based on the database location.
// If BD_SOCKET env var is set, uses that value instead (enables test isolation).
// On Unix systems, uses rpc.ShortSocketPath to avoid exceeding socket path limits
// (macOS: 104 chars) by relocating long paths to /tmp/beads-{hash}/ (GH#1001).
func getSocketPath() string {
// Check environment variable first (enables test isolation)
if socketPath := os.Getenv("BD_SOCKET"); socketPath != "" {
return socketPath
}
// Get workspace path (parent of .beads directory)
beadsDir := filepath.Dir(dbPath)
workspacePath := filepath.Dir(beadsDir)
return rpc.ShortSocketPath(workspacePath)
}
// emitVerboseWarning prints a one-line warning when falling back to direct mode
func emitVerboseWarning() {
switch daemonStatus.FallbackReason {
case FallbackConnectFailed:
fmt.Fprintf(os.Stderr, "Warning: Daemon unreachable at %s. Running in direct mode. Hint: bd daemon status\n", daemonStatus.SocketPath)
case FallbackHealthFailed:
fmt.Fprintf(os.Stderr, "Warning: Daemon unhealthy. Falling back to direct mode. Hint: bd daemon status --all\n")
case FallbackAutoStartDisabled:
fmt.Fprintf(os.Stderr, "Warning: Auto-start disabled (BEADS_AUTO_START_DAEMON=false). Running in direct mode. Hint: bd daemon\n")
case FallbackAutoStartFailed:
fmt.Fprintf(os.Stderr, "Warning: Failed to auto-start daemon. Running in direct mode. Hint: bd daemon status\n")
case FallbackDaemonUnsupported:
fmt.Fprintf(os.Stderr, "Warning: Daemon does not support this command yet. Running in direct mode. Hint: update daemon or use local mode.\n")
case FallbackWorktreeSafety:
// Don't warn - this is expected behavior. User can configure sync-branch to enable daemon.
return
case FallbackSingleProcessOnly:
// Don't warn - daemon is intentionally disabled for single-process backends (e.g., Dolt).
return
case FallbackFlagNoDaemon:
// Don't warn when user explicitly requested --no-daemon
return
}
}
func getDebounceDuration() time.Duration {
duration := config.GetDuration("flush-debounce")
if duration == 0 {
// If parsing failed, use default
return 5 * time.Second
}
return duration
}