From 7fa0c931953e323cd4cfa2e784e573af13e332f4 Mon Sep 17 00:00:00 2001 From: Steve Yegge Date: Fri, 7 Nov 2025 18:57:43 -0800 Subject: [PATCH] Fix daemon orphaning: track parent PID and exit when parent dies - Add ParentPID field to DaemonLockInfo struct - Daemon monitors parent process every 10 seconds - Gracefully exits when parent process dies - Prevents accumulation of orphaned daemons from dead sessions - Fixes race conditions from multiple daemons on same database Closes bd-zpnq --- cmd/bd/daemon.go | 12 ++++++---- cmd/bd/daemon_event_loop.go | 17 ++++++++++++++ cmd/bd/daemon_lock.go | 2 ++ cmd/bd/daemon_parent_test.go | 45 ++++++++++++++++++++++++++++++++++++ cmd/bd/daemon_server.go | 34 ++++++++++++++++++++++++++- 5 files changed, 105 insertions(+), 5 deletions(-) create mode 100644 cmd/bd/daemon_parent_test.go diff --git a/cmd/bd/daemon.go b/cmd/bd/daemon.go index 7d773efc..532275e4 100644 --- a/cmd/bd/daemon.go +++ b/cmd/bd/daemon.go @@ -372,6 +372,10 @@ func runDaemonLoop(interval time.Duration, autoCommit, autoPush bool, logPath, p doSync := createSyncFunc(ctx, store, autoCommit, autoPush, log) doSync() + // Get parent PID for monitoring (exit if parent dies) + parentPID := os.Getppid() + log.log("Monitoring parent process (PID %d)", parentPID) + // Choose event loop based on BEADS_DAEMON_MODE daemonMode := os.Getenv("BEADS_DAEMON_MODE") if daemonMode == "" { @@ -385,18 +389,18 @@ func runDaemonLoop(interval time.Duration, autoCommit, autoPush bool, logPath, p if jsonlPath == "" { log.log("Error: JSONL path not found, cannot use event-driven mode") log.log("Falling back to polling mode") - runEventLoop(ctx, cancel, ticker, doSync, server, serverErrChan, log) + runEventLoop(ctx, cancel, ticker, doSync, server, serverErrChan, parentPID, log) } else { // Event-driven mode uses separate export-only and import-only functions doExport := createExportFunc(ctx, store, autoCommit, autoPush, log) doAutoImport := createAutoImportFunc(ctx, store, log) - 
runEventDrivenLoop(ctx, cancel, server, serverErrChan, store, jsonlPath, doExport, doAutoImport, log) + runEventDrivenLoop(ctx, cancel, server, serverErrChan, store, jsonlPath, doExport, doAutoImport, parentPID, log) } case "poll": log.log("Using polling mode (interval: %v)", interval) - runEventLoop(ctx, cancel, ticker, doSync, server, serverErrChan, log) + runEventLoop(ctx, cancel, ticker, doSync, server, serverErrChan, parentPID, log) default: log.log("Unknown BEADS_DAEMON_MODE: %s (valid: poll, events), defaulting to poll", daemonMode) - runEventLoop(ctx, cancel, ticker, doSync, server, serverErrChan, log) + runEventLoop(ctx, cancel, ticker, doSync, server, serverErrChan, parentPID, log) } } diff --git a/cmd/bd/daemon_event_loop.go b/cmd/bd/daemon_event_loop.go index da7c442a..da7f3c02 100644 --- a/cmd/bd/daemon_event_loop.go +++ b/cmd/bd/daemon_event_loop.go @@ -15,6 +15,7 @@ import ( // - File system changes (JSONL modifications) // - RPC mutations (create, update, delete) // - Git operations (via hooks, optional) +// - Parent process monitoring (exit if parent dies) func runEventDrivenLoop( ctx context.Context, cancel context.CancelFunc, @@ -24,6 +25,7 @@ func runEventDrivenLoop( jsonlPath string, doExport func(), doAutoImport func(), + parentPID int, log daemonLogger, ) { sigChan := make(chan os.Signal, 1) @@ -83,6 +85,10 @@ func runEventDrivenLoop( healthTicker := time.NewTicker(60 * time.Second) defer healthTicker.Stop() + // Parent process check (every 10 seconds) + parentCheckTicker := time.NewTicker(10 * time.Second) + defer parentCheckTicker.Stop() + // Dropped events safety net (faster recovery than health check) droppedEventsTicker := time.NewTicker(1 * time.Second) defer droppedEventsTicker.Stop() @@ -101,6 +107,17 @@ func runEventDrivenLoop( // Periodic health validation (not sync) checkDaemonHealth(ctx, store, log) + case <-parentCheckTicker.C: + // Check if parent process is still alive + if !checkParentProcessAlive(parentPID) { + 
log.log("Parent process (PID %d) died, shutting down daemon", parentPID) + cancel() + if err := server.Stop(); err != nil { + log.log("Error stopping server: %v", err) + } + return + } + case <-func() <-chan time.Time { if fallbackTicker != nil { return fallbackTicker.C diff --git a/cmd/bd/daemon_lock.go b/cmd/bd/daemon_lock.go index 2ac2a4a8..5be10a23 100644 --- a/cmd/bd/daemon_lock.go +++ b/cmd/bd/daemon_lock.go @@ -16,6 +16,7 @@ var ErrDaemonLocked = errors.New("daemon lock already held by another process") // DaemonLockInfo represents the metadata stored in the daemon.lock file type DaemonLockInfo struct { PID int `json:"pid"` + ParentPID int `json:"parent_pid,omitempty"` // Parent process ID (0 if not tracked) Database string `json:"database"` Version string `json:"version"` StartedAt time.Time `json:"started_at"` @@ -63,6 +64,7 @@ func acquireDaemonLock(beadsDir string, dbPath string) (*DaemonLock, error) { // Write JSON metadata to the lock file lockInfo := DaemonLockInfo{ PID: os.Getpid(), + ParentPID: os.Getppid(), Database: dbPath, Version: Version, StartedAt: time.Now().UTC(), diff --git a/cmd/bd/daemon_parent_test.go b/cmd/bd/daemon_parent_test.go new file mode 100644 index 00000000..c94ae48b --- /dev/null +++ b/cmd/bd/daemon_parent_test.go @@ -0,0 +1,45 @@ +package main + +import ( + "os/exec" + "path/filepath" + "testing" +) + +// TestDaemonExitsWhenParentDies verifies that the daemon exits when its parent process dies +func TestDaemonExitsWhenParentDies(t *testing.T) { + if testing.Short() { + t.Skip("Skipping integration test in short mode") + } + + t.Skip("Manual test - requires daemon to be run externally") + + // This is a manual test scenario: + // 1. Start a shell process that spawns the daemon + // 2. Verify daemon tracks parent PID + // 3. Kill the shell process + // 4. Verify daemon exits within 10-15 seconds + // + // To test manually: + // $ sh -c 'bd daemon --interval 5s & sleep 100' & + // $ SHELL_PID=$! 
+ // $ # Check daemon.lock has parent_pid set to SHELL_PID + // $ kill $SHELL_PID + // $ # Daemon should exit within 10-15 seconds +} + +func mustAbs(t *testing.T, path string) string { + abs, err := filepath.Abs(path) + if err != nil { + t.Fatalf("Failed to get absolute path: %v", err) + } + return abs +} + +func runGitCmd(t *testing.T, dir string, args ...string) { + cmd := exec.Command("git", args...) + cmd.Dir = dir + if err := cmd.Run(); err != nil { + t.Fatalf("git %v failed: %v", args, err) + } +} diff --git a/cmd/bd/daemon_server.go b/cmd/bd/daemon_server.go index e8e1306c..dc340edc 100644 --- a/cmd/bd/daemon_server.go +++ b/cmd/bd/daemon_server.go @@ -73,12 +73,34 @@ func runGlobalDaemon(log daemonLogger) { log.log("Global daemon stopped") } +// checkParentProcessAlive checks if the parent process is still running. +// Returns true if parent is alive, false if it died. +// A parent PID of 0 (not tracked) is treated as alive; a parent PID of 1 (adopted by init) is treated as dead. +func checkParentProcessAlive(parentPID int) bool { + if parentPID == 0 { + // Parent PID not tracked (older lock files) + return true + } + + if parentPID == 1 { + // Adopted by init - parent died + return false + } + + // Check if parent process is running + return isProcessRunning(parentPID) +} + // runEventLoop runs the daemon event loop (polling mode) -func runEventLoop(ctx context.Context, cancel context.CancelFunc, ticker *time.Ticker, doSync func(), server *rpc.Server, serverErrChan chan error, log daemonLogger) { +func runEventLoop(ctx context.Context, cancel context.CancelFunc, ticker *time.Ticker, doSync func(), server *rpc.Server, serverErrChan chan error, parentPID int, log daemonLogger) { sigChan := make(chan os.Signal, 1) signal.Notify(sigChan, daemonSignals...) 
defer signal.Stop(sigChan) + // Parent process check (every 10 seconds) + parentCheckTicker := time.NewTicker(10 * time.Second) + defer parentCheckTicker.Stop() + for { select { case <-ticker.C: @@ -86,6 +108,16 @@ func runEventLoop(ctx context.Context, cancel context.CancelFunc, ticker *time.T return } doSync() + case <-parentCheckTicker.C: + // Check if parent process is still alive + if !checkParentProcessAlive(parentPID) { + log.log("Parent process (PID %d) died, shutting down daemon", parentPID) + cancel() + if err := server.Stop(); err != nil { + log.log("Error stopping server: %v", err) + } + return + } case sig := <-sigChan: if isReloadSignal(sig) { log.log("Received reload signal, ignoring (daemon continues running)")